Newer
Older
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
/*
* lowcomms.c
*
* This is the "low-level" comms layer.
*
* It is responsible for sending/receiving messages
* from other nodes in the cluster.
*
* Cluster nodes are referred to by their nodeids. nodeids are
* simply 32 bit numbers to the locking module - if they need to
* be expanded for the cluster infrastructure then that is it's
* responsibility. It is this layer's
* responsibility to resolve these into IP address or
* whatever it needs for inter-node communication.
*
* The comms level is two kernel threads that deal mainly with
* the receiving of messages from other nodes and passing them
* up to the mid-level comms layer (which understands the
* message format) for execution by the locking core, and
* a send thread which does all the setting up of connections
* to remote nodes and the sending of data. Threads are not allowed
* to send their own data because it may cause them to wait in times
* of high load. Also, this way, the sending thread can collect together
* messages bound for one node and send them in one block.
*
* lowcomms will choose to use wither TCP or SCTP as its transport layer
* depending on the configuration variable 'protocol'. This should be set
* to 0 (default) for TCP or 1 for SCTP. It shouldbe configured using a
* cluster-wide mechanism as it must be the same on all nodes of the cluster
* for the DLM to function.
*
*/
#include <asm/ioctls.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/sctp.h>
#include <net/sctp/user.h>
#include "dlm_internal.h"
#include "lowcomms.h"
#include "midcomms.h"
#include "config.h"
#define NEEDED_RMEM (4*1024*1024)
unsigned int base;
unsigned int len;
unsigned int mask;
static void cbuf_add(struct cbuf *cb, int n)
{
cb->len += n;
}
static int cbuf_data(struct cbuf *cb)
{
return ((cb->base + cb->len) & cb->mask);
}
static void cbuf_init(struct cbuf *cb, int size)
{
cb->base = cb->len = 0;
cb->mask = size-1;
}
static void cbuf_eat(struct cbuf *cb, int n)
{
cb->len -= n;
cb->base += n;
cb->base &= cb->mask;
}
static bool cbuf_empty(struct cbuf *cb)
{
return cb->len == 0;
}
struct connection {
struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */
#define CF_READ_PENDING 1
#define CF_WRITE_PENDING 2
#define CF_CONNECT_PENDING 3
#define CF_INIT_PENDING 4
#define CF_IS_OTHERCON 5
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
int (*rx_action) (struct connection *); /* What to do when active */
void (*connect_action) (struct connection *); /* What to do to connect */
struct page *rx_page;
struct cbuf cb;
int retries;
#define MAX_CONNECT_RETRIES 3
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
/* An entry waiting to be sent */
struct writequeue_entry {
struct list_head list;
struct page *page;
int offset;
int len;
int end;
int users;
struct connection *con;
};
static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
/* Work queues */
static struct workqueue_struct *recv_workqueue;
static struct workqueue_struct *send_workqueue;
static DEFINE_IDR(connections_idr);
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
/*
* If 'allocation' is zero then we don't attempt to create a new
* connection structure for this node.
*/
static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
{
struct connection *con = NULL;
con = idr_find(&connections_idr, nodeid);
if (con || !alloc)
return con;
r = idr_pre_get(&connections_idr, alloc);
if (!r)
return NULL;
con = kmem_cache_zalloc(con_cache, alloc);
if (!con)
return NULL;
r = idr_get_new_above(&connections_idr, con, nodeid, &n);
if (r) {
kmem_cache_free(con_cache, con);
return NULL;
}
if (n != nodeid) {
idr_remove(&connections_idr, n);
kmem_cache_free(con_cache, con);
return NULL;
con->nodeid = nodeid;
mutex_init(&con->sock_mutex);
INIT_LIST_HEAD(&con->writequeue);
spin_lock_init(&con->writequeue_lock);
INIT_WORK(&con->swork, process_send_sockets);
INIT_WORK(&con->rwork, process_recv_sockets);
/* Setup action pointers for child sockets */
if (con->nodeid) {
struct connection *zerocon = idr_find(&connections_idr, 0);
con->connect_action = zerocon->connect_action;
if (!con->rx_action)
con->rx_action = zerocon->rx_action;
if (nodeid > max_nodeid)
max_nodeid = nodeid;
return con;
}
static struct connection *nodeid2con(int nodeid, gfp_t allocation)
{
struct connection *con;
down(&connections_lock);
con = __nodeid2con(nodeid, allocation);
/* This is a bit drastic, but only called when things go wrong */
static struct connection *assoc2con(int assoc_id)
{
int i;
struct connection *con;
down(&connections_lock);
for (i=0; i<=max_nodeid; i++) {
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
con = __nodeid2con(i, 0);
if (con && con->sctp_assoc == assoc_id) {
up(&connections_lock);
return con;
}
}
up(&connections_lock);
return NULL;
}
static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
{
struct sockaddr_storage addr;
int error;
if (!dlm_local_count)
return -1;
error = dlm_nodeid_to_addr(nodeid, &addr);
if (error)
return error;
if (dlm_local_addr[0]->ss_family == AF_INET) {
struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
} else {
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
memcpy(&ret6->sin6_addr, &in6->sin6_addr,
sizeof(in6->sin6_addr));
}
return 0;
}
/* Data available on socket or listen socket received a connect */
static void lowcomms_data_ready(struct sock *sk, int count_unused)
{
struct connection *con = sock2con(sk);
if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
queue_work(recv_workqueue, &con->rwork);
}
static void lowcomms_write_space(struct sock *sk)
{
struct connection *con = sock2con(sk);
if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
static inline void lowcomms_connect_sock(struct connection *con)
{
if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
static void lowcomms_state_change(struct sock *sk)
{
lowcomms_write_space(sk);
}
/* Make a socket active */
static int add_sock(struct socket *sock, struct connection *con)
{
con->sock = sock;
/* Install a data_ready callback */
con->sock->sk->sk_data_ready = lowcomms_data_ready;
con->sock->sk->sk_write_space = lowcomms_write_space;
con->sock->sk->sk_state_change = lowcomms_state_change;
con->sock->sk->sk_user_data = con;
/* Add the port number to an IPv6 or 4 sockaddr and return the address
length */
static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
int *addr_len)
{
saddr->ss_family = dlm_local_addr[0]->ss_family;
struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
in4_addr->sin_port = cpu_to_be16(port);
*addr_len = sizeof(struct sockaddr_in);
memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
in6_addr->sin6_port = cpu_to_be16(port);
*addr_len = sizeof(struct sockaddr_in6);
}
memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
}
/* Close a remote connection and tidy up */
static void close_connection(struct connection *con, bool and_other)
if (con->sock) {
sock_release(con->sock);
con->sock = NULL;
}
if (con->othercon && and_other) {
/* Will only re-enter once. */
close_connection(con->othercon, false);
kmem_cache_free(con_cache, con->othercon);
con->othercon = NULL;
}
if (con->rx_page) {
__free_page(con->rx_page);
con->rx_page = NULL;
}
con->retries = 0;
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
/* We only send shutdown messages to nodes that are not part of the cluster */
static void sctp_send_shutdown(sctp_assoc_t associd)
{
static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
struct msghdr outmessage;
struct cmsghdr *cmsg;
struct sctp_sndrcvinfo *sinfo;
int ret;
struct connection *con;
con = nodeid2con(0,0);
BUG_ON(con == NULL);
outmessage.msg_name = NULL;
outmessage.msg_namelen = 0;
outmessage.msg_control = outcmsg;
outmessage.msg_controllen = sizeof(outcmsg);
outmessage.msg_flags = MSG_EOR;
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
cmsg->cmsg_type = SCTP_SNDRCV;
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
outmessage.msg_controllen = cmsg->cmsg_len;
sinfo = CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
sinfo->sinfo_flags |= MSG_EOF;
sinfo->sinfo_assoc_id = associd;
ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0);
if (ret != 0)
log_print("send EOF to node failed: %d", ret);
}
/* INIT failed but we don't know which node...
restart INIT on all pending nodes */
static void sctp_init_failed(void)
{
int i;
struct connection *con;
down(&connections_lock);
for (i=1; i<=max_nodeid; i++) {
con = __nodeid2con(i, 0);
if (!con)
continue;
con->sctp_assoc = 0;
if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
queue_work(send_workqueue, &con->swork);
}
}
}
up(&connections_lock);
}
/* Something happened to an association */
static void process_sctp_notification(struct connection *con,
struct msghdr *msg, char *buf)
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
{
union sctp_notification *sn = (union sctp_notification *)buf;
if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
switch (sn->sn_assoc_change.sac_state) {
case SCTP_COMM_UP:
case SCTP_RESTART:
{
/* Check that the new node is in the lockspace */
struct sctp_prim prim;
int nodeid;
int prim_len, ret;
int addr_len;
struct connection *new_con;
struct file *file;
sctp_peeloff_arg_t parg;
int parglen = sizeof(parg);
/*
* We get this before any data for an association.
* We verify that the node is in the cluster and
* then peel off a socket for it.
*/
if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
log_print("COMM_UP for invalid assoc ID %d",
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
sctp_init_failed();
return;
}
memset(&prim, 0, sizeof(struct sctp_prim));
prim_len = sizeof(struct sctp_prim);
prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
ret = kernel_getsockopt(con->sock,
IPPROTO_SCTP,
SCTP_PRIMARY_ADDR,
(char*)&prim,
&prim_len);
if (ret < 0) {
log_print("getsockopt/sctp_primary_addr on "
"new assoc %d failed : %d",
(int)sn->sn_assoc_change.sac_assoc_id,
ret);
/* Retry INIT later */
new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
if (new_con)
clear_bit(CF_CONNECT_PENDING, &con->flags);
return;
}
make_sockaddr(&prim.ssp_addr, 0, &addr_len);
if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
int i;
unsigned char *b=(unsigned char *)&prim.ssp_addr;
log_print("reject connect from unknown addr");
for (i=0; i<sizeof(struct sockaddr_storage);i++)
printk("%02x ", b[i]);
printk("\n");
sctp_send_shutdown(prim.ssp_assoc_id);
return;
}
new_con = nodeid2con(nodeid, GFP_KERNEL);
if (!new_con)
return;
/* Peel off a new sock */
parg.associd = sn->sn_assoc_change.sac_assoc_id;
ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
SCTP_SOCKOPT_PEELOFF,
log_print("Can't peel off a socket for "
"connection %d to node %d: err=%d\n",
parg.associd, nodeid, ret);
}
file = fget(parg.sd);
new_con->sock = SOCKET_I(file->f_dentry->d_inode);
add_sock(new_con->sock, new_con);
fput(file);
put_unused_fd(parg.sd);
log_print("got new/restarted association %d nodeid %d",
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
clear_bit(CF_INIT_PENDING, &con->flags);
if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
queue_work(send_workqueue, &new_con->swork);
}
if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags))
queue_work(recv_workqueue, &new_con->rwork);
}
break;
case SCTP_COMM_LOST:
case SCTP_SHUTDOWN_COMP:
{
con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
if (con) {
con->sctp_assoc = 0;
}
}
break;
/* We don't know which INIT failed, so clear the PENDING flags
* on them all. if assoc_id is zero then it will then try
* again */
case SCTP_CANT_STR_ASSOC:
{
log_print("Can't start SCTP association - retrying");
sctp_init_failed();
}
break;
default:
log_print("unexpected SCTP assoc change id=%d state=%d",
(int)sn->sn_assoc_change.sac_assoc_id,
sn->sn_assoc_change.sac_state);
}
}
}
/* Data received from remote end */
static int receive_from_sock(struct connection *con)
{
int ret = 0;
struct msghdr msg = {};
struct kvec iov[2];
unsigned len;
int r;
int call_again_soon = 0;
char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
if (con->sock == NULL) {
ret = -EAGAIN;
goto out_close;
}
if (con->rx_page == NULL) {
/*
* This doesn't need to be atomic, but I think it should
* improve performance if it is.
*/
con->rx_page = alloc_page(GFP_ATOMIC);
if (con->rx_page == NULL)
goto out_resched;
/* Only SCTP needs these really */
memset(&incmsg, 0, sizeof(incmsg));
msg.msg_control = incmsg;
msg.msg_controllen = sizeof(incmsg);
/*
* iov[0] is the bit of the circular buffer between the current end
* point (cb.base + cb.len) and the end of the buffer.
*/
iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
/*
* iov[1] is the bit of the circular buffer between the start of the
* buffer and the start of the currently used section (cb.base)
*/
if (cbuf_data(&con->cb) >= con->cb.base) {
iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb);
iov[1].iov_len = con->cb.base;
iov[1].iov_base = page_address(con->rx_page);
}
len = iov[0].iov_len + iov[1].iov_len;
r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len,
MSG_DONTWAIT | MSG_NOSIGNAL);
if (ret <= 0)
goto out_close;
/* Process SCTP notifications */
if (msg.msg_flags & MSG_NOTIFICATION) {
msg.msg_control = incmsg;
msg.msg_controllen = sizeof(incmsg);
process_sctp_notification(con, &msg,
mutex_unlock(&con->sock_mutex);
return 0;
}
BUG_ON(con->nodeid == 0);
if (ret == len)
call_again_soon = 1;
ret = dlm_process_incoming_buffer(con->nodeid,
page_address(con->rx_page),
con->cb.base, con->cb.len,
PAGE_CACHE_SIZE);
if (ret == -EBADMSG) {
log_print("lowcomms: addr=%p, base=%u, len=%u, "
"iov_len=%u, iov_base[0]=%p, read=%d",
page_address(con->rx_page), con->cb.base, con->cb.len,
len, iov[0].iov_base, r);
}
if (ret < 0)
goto out_close;
if (cbuf_empty(&con->cb) && !call_again_soon) {
__free_page(con->rx_page);
con->rx_page = NULL;
}
if (call_again_soon)
goto out_resched;
if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
queue_work(recv_workqueue, &con->rwork);
if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
/* Reconnect when there is something to send */
}
/* Don't return success if we really got EOF */
if (ret == 0)
ret = -EAGAIN;
return ret;
}
/* Listening socket is busy, accept a connection */
static int tcp_accept_from_sock(struct connection *con)
{
int result;
struct sockaddr_storage peeraddr;
struct socket *newsock;
int len;
int nodeid;
struct connection *newcon;
memset(&peeraddr, 0, sizeof(peeraddr));
result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
if (result < 0)
return -ENOMEM;
mutex_lock_nested(&con->sock_mutex, 0);
result = -ENOTCONN;
if (con->sock == NULL)
goto accept_err;
newsock->type = con->sock->type;
newsock->ops = con->sock->ops;
result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
if (result < 0)
goto accept_err;
/* Get the connected socket's peer */
memset(&peeraddr, 0, sizeof(peeraddr));
if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
&len, 2)) {
result = -ECONNABORTED;
goto accept_err;
}
/* Get the new node's NODEID */
make_sockaddr(&peeraddr, 0, &len);
if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
return -1;
}
log_print("got connection from %d", nodeid);
/* Check to see if we already have a connection to this node. This
* could happen if the two nodes initiate a connection at roughly
* the same time and the connections cross on the wire.
* In this case we store the incoming one in "othercon"
*/
newcon = nodeid2con(nodeid, GFP_KERNEL);
if (!newcon) {
result = -ENOMEM;
goto accept_err;
}
mutex_lock_nested(&newcon->sock_mutex, 1);
struct connection *othercon = newcon->othercon;
othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
result = -ENOMEM;
goto accept_err;
}
othercon->nodeid = nodeid;
othercon->rx_action = receive_from_sock;
INIT_WORK(&othercon->swork, process_send_sockets);
INIT_WORK(&othercon->rwork, process_recv_sockets);
set_bit(CF_IS_OTHERCON, &othercon->flags);
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
add_sock(newsock, othercon);
addcon = othercon;
}
else {
printk("Extra connection from node %d attempted\n", nodeid);
result = -EAGAIN;
}
}
else {
newsock->sk->sk_user_data = newcon;
newcon->rx_action = receive_from_sock;
add_sock(newsock, newcon);
/*
* Add it to the active queue in case we got data
* beween processing the accept adding the socket
* to the read_sockets list
*/
if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
queue_work(recv_workqueue, &addcon->rwork);
sock_release(newsock);
if (result != -EAGAIN)
log_print("error accepting connection from node: %d", result);
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
static void free_entry(struct writequeue_entry *e)
{
__free_page(e->page);
kfree(e);
}
/* Initiate an SCTP association.
This is a special case of send_to_sock() in that we don't yet have a
peeled-off socket for this association, so we use the listening socket
and add the primary IP address of the remote node.
*/
static void sctp_init_assoc(struct connection *con)
{
struct sockaddr_storage rem_addr;
char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
struct msghdr outmessage;
struct cmsghdr *cmsg;
struct sctp_sndrcvinfo *sinfo;
struct connection *base_con;
struct writequeue_entry *e;
int len, offset;
int ret;
int addrlen;
struct kvec iov[1];
if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
return;
if (con->retries++ > MAX_CONNECT_RETRIES)
return;
log_print("Initiating association with node %d", con->nodeid);
if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
log_print("no address for nodeid %d", con->nodeid);
return;
}
base_con = nodeid2con(0, 0);
BUG_ON(base_con == NULL);
make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
outmessage.msg_name = &rem_addr;
outmessage.msg_namelen = addrlen;
outmessage.msg_control = outcmsg;
outmessage.msg_controllen = sizeof(outcmsg);
outmessage.msg_flags = MSG_EOR;
spin_lock(&con->writequeue_lock);
e = list_entry(con->writequeue.next, struct writequeue_entry,
list);
BUG_ON((struct list_head *) e == &con->writequeue);
len = e->len;
offset = e->offset;
spin_unlock(&con->writequeue_lock);
kmap(e->page);
/* Send the first block off the write queue */
iov[0].iov_base = page_address(e->page)+offset;
iov[0].iov_len = len;
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
cmsg->cmsg_type = SCTP_SNDRCV;
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
sinfo = CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
outmessage.msg_controllen = cmsg->cmsg_len;
ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
if (ret < 0) {
log_print("Send first packet to node %d failed: %d",
con->nodeid, ret);
/* Try again later */
clear_bit(CF_CONNECT_PENDING, &con->flags);
clear_bit(CF_INIT_PENDING, &con->flags);
}
else {
spin_lock(&con->writequeue_lock);
e->offset += ret;
e->len -= ret;
if (e->len == 0 && e->users == 0) {
list_del(&e->list);
kunmap(e->page);
free_entry(e);
}
spin_unlock(&con->writequeue_lock);
}
}
/* Connect a new socket to its peer */
static void tcp_connect_to_sock(struct connection *con)
{
int result = -EHOSTUNREACH;
struct sockaddr_storage saddr;
int addr_len;
struct socket *sock;
if (con->nodeid == 0) {
log_print("attempt to connect sock 0 foiled");
if (con->retries++ > MAX_CONNECT_RETRIES)
goto out;
/* Some odd races can cause double-connects, ignore them */
if (con->sock) {
result = 0;
goto out;
}
/* Create a socket to communicate with */
result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
if (result < 0)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
if (dlm_nodeid_to_addr(con->nodeid, &saddr))
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = tcp_connect_to_sock;
add_sock(sock, con);
make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
log_print("connecting to %d", con->nodeid);
result =
sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
if (result == -EINPROGRESS)
result = 0;
if (con->sock) {
sock_release(con->sock);
con->sock = NULL;
}
/*
* Some errors are fatal and this list might need adjusting. For other
* errors we try again until the max number of retries is reached.
*/
if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
result != -ENETDOWN && result != EINVAL
&& result != -EPROTONOSUPPORT) {
lowcomms_connect_sock(con);
result = 0;
}
static struct socket *tcp_create_listen_sock(struct connection *con,
struct sockaddr_storage *saddr)
int result = 0;
int one = 1;
int addr_len;
if (dlm_local_addr[0]->ss_family == AF_INET)
addr_len = sizeof(struct sockaddr_in);
else
addr_len = sizeof(struct sockaddr_in6);
/* Create a socket to communicate with */
result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
IPPROTO_TCP, &sock);
result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
(char *)&one, sizeof(one));
log_print("Failed to set SO_REUSEADDR on socket: %d", result);
}
sock->sk->sk_user_data = con;
con->rx_action = tcp_accept_from_sock;
con->connect_action = tcp_connect_to_sock;
con->sock = sock;
/* Bind to our port */
make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
if (result < 0) {
log_print("Can't bind to port %d", dlm_config.ci_tcp_port);
sock_release(sock);
sock = NULL;
con->sock = NULL;
goto create_out;
}
result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
}
result = sock->ops->listen(sock, 5);
if (result < 0) {
log_print("Can't listen on port %d", dlm_config.ci_tcp_port);
sock_release(sock);
sock = NULL;
goto create_out;
}
/* Get local addresses */
static void init_local(void)
{
struct sockaddr_storage sas, *addr;
int i;
dlm_local_count = 0;
for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
if (dlm_our_addr(&sas, i))
break;