Skip to content
Snippets Groups Projects
tcp.c 58.1 KiB
Newer Older
/* -*- mode: c; c-basic-offset: 8; -*-
 *
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 *
 * ----
 *
 * Callers for this were originally written against a very simple synchronus
 * API.  This implementation reflects those simple callers.  Some day I'm sure
 * we'll need to move to a more robust posting/callback mechanism.
 *
 * Transmit calls pass in kernel virtual addresses and block copying this into
 * the socket's tx buffers via a usual blocking sendmsg.  They'll block waiting
 * for a failed socket to timeout.  TX callers can also pass in a poniter to an
 * 'int' which gets filled with an errno off the wire in response to the
 * message they send.
 *
 * Handlers for unsolicited messages are registered.  Each socket has a page
 * that incoming data is copied into.  First the header, then the data.
 * Handlers are called from only one thread with a reference to this per-socket
 * page.  This page is destroyed after the handler call, so it can't be
 * referenced beyond the call.  Handlers may block but are discouraged from
 * doing so.
 *
 * Any framing errors (bad magic, large payload lengths) close a connection.
 *
 * Our sock_container holds the state we associate with a socket.  It's current
 * framing state is held there as well as the refcounting we do around when it
 * is safe to tear down the socket.  The socket is only finally torn down from
 * the container when the container loses all of its references -- so as long
 * as you hold a ref on the container you can trust that the socket is valid
 * for use with kernel socket APIs.
 *
 * Connections are initiated between a pair of nodes when the node with the
 * higher node number gets a heartbeat callback which indicates that the lower
 * numbered node has started heartbeating.  The lower numbered node is passive
 * and only accepts the connection if the higher numbered node is heartbeating.
 */

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/kref.h>
#include <linux/net.h>
#include <net/tcp.h>

#include <asm/uaccess.h>

#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
#define MLOG_MASK_PREFIX ML_TCP
#include "masklog.h"
#include "quorum.h"

#include "tcp_internal.h"

#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,	\
			  &sc->sc_node->nd_ipv4_address,		\
			  ntohs(sc->sc_node->nd_ipv4_port)

/*
 * In the following two log macros, the whitespace after the ',' just
 * before ##args is intentional. Otherwise, gcc 2.95 will eat the
 * previous token if args expands to nothing.
 */
#define msglog(hdr, fmt, args...) do {					\
	typeof(hdr) __hdr = (hdr);					\
	mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d "	\
	     "key %08x num %u] " fmt,					\
	     be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), 	\
	     be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status),	\
	     be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key),	\
	     be32_to_cpu(__hdr->msg_num) ,  ##args);			\
} while (0)

#define sclog(sc, fmt, args...) do {					\
	typeof(sc) __sc = (sc);						\
	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
	     "pg_off %zu] " fmt, __sc,					\
	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
	    ##args);							\
} while (0)

static DEFINE_RWLOCK(o2net_handler_lock);
static struct rb_root o2net_handler_tree = RB_ROOT;

static struct o2net_node o2net_nodes[O2NM_MAX_NODES];

/* XXX someday we'll need better accounting */
static struct socket *o2net_listen_sock = NULL;

/*
 * listen work is only queued by the listening socket callbacks on the
 * o2net_wq.  teardown detaches the callbacks before destroying the workqueue.
 * quorum work is queued as sock containers are shutdown.. stop_listening
 * tears down all the node's sock containers, preventing future shutdowns
 * and queued quroum work, before canceling delayed quorum work and
 * destroying the work queue.
 */
static struct workqueue_struct *o2net_wq;
static struct work_struct o2net_listen_work;

static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
#define O2NET_HB_PRI 0x1

static struct o2net_handshake *o2net_hand;
static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;

static int o2net_sys_err_translations[O2NET_ERR_MAX] =
		{[O2NET_ERR_NONE]	= 0,
		 [O2NET_ERR_NO_HNDLR]	= -ENOPROTOOPT,
		 [O2NET_ERR_OVERFLOW]	= -EOVERFLOW,
		 [O2NET_ERR_DIED]	= -EHOSTDOWN,};

/* can't quite avoid *all* internal declarations :/ */
static void o2net_sc_connect_completed(struct work_struct *work);
static void o2net_rx_until_empty(struct work_struct *work);
static void o2net_shutdown_sc(struct work_struct *work);
static void o2net_listen_data_ready(struct sock *sk, int bytes);
static void o2net_sc_send_keep_req(struct work_struct *work);
static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);

#ifdef CONFIG_DEBUG_FS
static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
			   u32 msgkey, struct task_struct *task, u8 node)
	INIT_LIST_HEAD(&nst->st_net_debug_item);
	nst->st_task = task;
	nst->st_msg_type = msgtype;
	nst->st_msg_key = msgkey;
	nst->st_node = node;
}

static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
	nst->st_status_time = ktime_get();
static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
						struct o2net_sock_container *sc)
static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
					u32 msg_id)
{
	nst->st_id = msg_id;
}
static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
	sc->sc_tv_advance_start = ktime_get();
static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
Loading
Loading full blame...