tlb_uv.c

/*
 *	SGI UltraViolet TLB flush routines.
 *
 *	(c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 */
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/delay.h>

#include <asm/mmu_context.h>
#include <asm/uv/uv.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_bau.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/tsc.h>
#include <asm/irq_vectors.h>
#include <asm/timer.h>

/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
static int timeout_base_ns[] = {
		20,
		160,
		1280,
		10240,
		81920,
		655360,
		5242880,
		167772160
};

static int timeout_us;
static int nobau;
static int nobau_perm;
static cycles_t congested_cycles;

/* tunables: */
static int max_concurr		= MAX_BAU_CONCURRENT;
static int max_concurr_const	= MAX_BAU_CONCURRENT;
static int plugged_delay	= PLUGGED_DELAY;
static int plugsb4reset		= PLUGSB4RESET;
static int giveup_limit		= GIVEUP_LIMIT;
static int timeoutsb4reset	= TIMEOUTSB4RESET;
static int ipi_reset_limit	= IPI_RESET_LIMIT;
static int complete_threshold	= COMPLETE_THRESHOLD;
static int congested_respns_us	= CONGESTED_RESPONSE_US;
static int congested_reps	= CONGESTED_REPS;
static int disabled_period	= DISABLED_PERIOD;

static struct tunables tunables[] = {
	{&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
	{&plugged_delay, PLUGGED_DELAY},
	{&plugsb4reset, PLUGSB4RESET},
	{&timeoutsb4reset, TIMEOUTSB4RESET},
	{&ipi_reset_limit, IPI_RESET_LIMIT},
	{&complete_threshold, COMPLETE_THRESHOLD},
	{&congested_respns_us, CONGESTED_RESPONSE_US},
	{&congested_reps, CONGESTED_REPS},
	{&disabled_period, DISABLED_PERIOD},
	{&giveup_limit, GIVEUP_LIMIT}
};

static struct dentry *tunables_dir;
static struct dentry *tunables_file;

/* these correspond to the statistics printed by ptc_seq_show() */
static char *stat_description[] = {
	"sent:     number of shootdown messages sent",
	"stime:    time spent sending messages",
	"numuvhubs: number of hubs targeted with shootdown",
	"numuvhubs16: number times 16 or more hubs targeted",
	"numuvhubs8: number times 8 or more hubs targeted",
	"numuvhubs4: number times 4 or more hubs targeted",
	"numuvhubs2: number times 2 or more hubs targeted",
	"numuvhubs1: number times 1 hub targeted",
	"numcpus:  number of cpus targeted with shootdown",
	"dto:      number of destination timeouts",
	"retries:  destination timeout retries sent",
	"rok:   :  destination timeouts successfully retried",
	"resetp:   ipi-style resource resets for plugs",
	"resett:   ipi-style resource resets for timeouts",
	"giveup:   fall-backs to ipi-style shootdowns",
	"sto:      number of source timeouts",
	"bz:       number of stay-busy's",
	"throt:    number times spun in throttle",
	"swack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
	"recv:     shootdown messages received",
	"rtime:    time spent processing messages",
	"all:      shootdown all-tlb messages",
	"one:      shootdown one-tlb messages",
	"mult:     interrupts that found multiple messages",
	"none:     interrupts that found no messages",
	"retry:    number of retry messages processed",
	"canc:     number messages canceled by retries",
	"nocan:    number retries that found nothing to cancel",
	"reset:    number of ipi-style reset requests processed",
	"rcan:     number messages canceled by reset requests",
	"disable:  number times use of the BAU was disabled",
	"enable:   number times use of the BAU was re-enabled"
};

static int __init
setup_nobau(char *arg)
{
	nobau = 1;
	return 0;
}
early_param("nobau", setup_nobau);

/* base pnode in this partition */
static int uv_base_pnode __read_mostly;

static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);

static void
set_bau_on(void)
{
	int cpu;
	struct bau_control *bcp;

	if (nobau_perm) {
		pr_info("BAU not initialized; cannot be turned on\n");
		return;
	}
	nobau = 0;
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->nobau = 0;
	}
	pr_info("BAU turned on\n");
	return;
}

static void
set_bau_off(void)
{
	int cpu;
	struct bau_control *bcp;

	nobau = 1;
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->nobau = 1;
	}
	pr_info("BAU turned off\n");
	return;
}

/*
 * Determine the first node on a uvhub. 'Nodes' are used for kernel
 * memory allocation.
 */
static int __init uvhub_to_first_node(int uvhub)
{
	int node, b;

	for_each_online_node(node) {
		b = uv_node_to_blade_id(node);
		if (uvhub == b)
			return node;
	}
	return -1;
}

/*
 * Determine the apicid of the first cpu on a uvhub.
 */
static int __init uvhub_to_first_apicid(int uvhub)
{
	int cpu;

	for_each_present_cpu(cpu)
		if (uvhub == uv_cpu_to_blade_id(cpu))
			return per_cpu(x86_cpu_to_apicid, cpu);
	return -1;
}

/*
 * Free a software acknowledge hardware resource by clearing its Pending
 * bit. This will return a reply to the sender.
 * If the message has timed out, a reply has already been sent by the
 * hardware but the resource has not been released. In that case our
 * clear of the Timeout bit (as well) will free the resource. No reply will
 * be sent (the hardware will only do one reply per message).
 */
static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
						int do_acknowledge)
{
	unsigned long dw;
	struct bau_pq_entry *msg;

	msg = mdp->msg;
	if (!msg->canceled && do_acknowledge) {
		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
		write_mmr_sw_ack(dw);
	}
	msg->replied_to = 1;
	msg->swack_vec = 0;
}

/*
 * Process the receipt of a RETRY message
 */
static void bau_process_retry_msg(struct msg_desc *mdp,
					struct bau_control *bcp)
{
	int i;
	int cancel_count = 0;
	unsigned long msg_res;
	unsigned long mmr = 0;
	struct bau_pq_entry *msg = mdp->msg;
	struct bau_pq_entry *msg2;
	struct ptc_stats *stat = bcp->statp;

	stat->d_retries++;
	/*
	 * cancel any message from msg+1 to the retry itself
	 */
	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
		if (msg2 > mdp->queue_last)
			msg2 = mdp->queue_first;
		if (msg2 == msg)
			break;

		/* same conditions for cancellation as do_reset */
		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
		    (msg2->swack_vec) && ((msg2->swack_vec &
			msg->swack_vec) == 0) &&
		    (msg2->sending_cpu == msg->sending_cpu) &&
		    (msg2->msg_type != MSG_NOOP)) {
			mmr = read_mmr_sw_ack();
			msg_res = msg2->swack_vec;
			/*
			 * This is a message retry; clear the resources held
			 * by the previous message only if they timed out.
			 * If it has not timed out we have an unexpected
			 * situation to report.
			 */
			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
				unsigned long mr;
				/*
				 * Is the resource timed out?
				 * Make everyone ignore the cancelled message.
				 */
				msg2->canceled = 1;
				stat->d_canceled++;
				cancel_count++;
				mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
				write_mmr_sw_ack(mr);
			}
		}
	}
	if (!cancel_count)
		stat->d_nocanceled++;
}

/*
 * Do all the things a cpu should do for a TLB shootdown message.
 * Other cpu's may come here at the same time for this message.
 */
static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
						int do_acknowledge)
{
	short socket_ack_count = 0;
	short *sp;
	struct atomic_short *asp;
	struct ptc_stats *stat = bcp->statp;
	struct bau_pq_entry *msg = mdp->msg;
	struct bau_control *smaster = bcp->socket_master;

	/*
	 * This must be a normal message, or retry of a normal message
	 */
	if (msg->address == TLB_FLUSH_ALL) {
		local_flush_tlb();
		stat->d_alltlb++;
	} else {
		__flush_tlb_one(msg->address);
		stat->d_onetlb++;
	}
	stat->d_requestee++;

	/*
	 * One cpu on each uvhub has the additional job on a RETRY
	 * of releasing the resource held by the message that is
	 * being retried.  That message is identified by sending
	 * cpu number.
	 */
	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
		bau_process_retry_msg(mdp, bcp);

	/*
	 * This is a swack message, so we have to reply to it.
	 * Count each responding cpu on the socket. This avoids
	 * pinging the count's cache line back and forth between
	 * the sockets.
	 */
	sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
	asp = (struct atomic_short *)sp;
	socket_ack_count = atom_asr(1, asp);
	if (socket_ack_count == bcp->cpus_in_socket) {
		int msg_ack_count;
		/*
		 * Both sockets dump their completed count total into
		 * the message's count.
		 */
		*sp = 0;
		asp = (struct atomic_short *)&msg->acknowledge_count;
		msg_ack_count = atom_asr(socket_ack_count, asp);

		if (msg_ack_count == bcp->cpus_in_uvhub) {
			/*
			 * All cpus in uvhub saw it; reply
			 * (unless we are in the UV2 workaround)
			 */
			reply_to_message(mdp, bcp, do_acknowledge);
		}
	}

	return;
}

/*
 * Determine the first cpu on a pnode.
 */
static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
{
	int cpu;
	struct hub_and_pnode *hpp;

	for_each_present_cpu(cpu) {
		hpp = &smaster->thp[cpu];
		if (pnode == hpp->pnode)
			return cpu;
	}
	return -1;
}

/*
 * Last resort when we get a large number of destination timeouts is
 * to clear resources held by a given cpu.
 * Do this with IPI so that all messages in the BAU message queue
 * can be identified by their nonzero swack_vec field.
 *
 * This is entered for a single cpu on the uvhub.
 * The sender want's this uvhub to free a specific message's
 * swack resources.
 */
static void do_reset(void *ptr)
{
	int i;
	struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
	struct reset_args *rap = (struct reset_args *)ptr;
	struct bau_pq_entry *msg;
	struct ptc_stats *stat = bcp->statp;

	stat->d_resets++;
	/*
	 * We're looking for the given sender, and
	 * will free its swack resource.
	 * If all cpu's finally responded after the timeout, its
	 * message 'replied_to' was set.
	 */
	for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
		unsigned long msg_res;
		/* do_reset: same conditions for cancellation as
		   bau_process_retry_msg() */
		if ((msg->replied_to == 0) &&
		    (msg->canceled == 0) &&
		    (msg->sending_cpu == rap->sender) &&
		    (msg->swack_vec) &&
		    (msg->msg_type != MSG_NOOP)) {
			unsigned long mmr;
			unsigned long mr;
			/*
			 * make everyone else ignore this message
			 */
			msg->canceled = 1;
			/*
			 * only reset the resource if it is still pending
			 */
			mmr = read_mmr_sw_ack();
			msg_res = msg->swack_vec;
			mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
			if (mmr & msg_res) {
				stat->d_rcanceled++;
				write_mmr_sw_ack(mr);
			}
		}
	}
	return;
}

/*
 * Use IPI to get all target uvhubs to release resources held by
 * a given sending cpu number.
 */
static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
{
	int pnode;
	int apnode;
	int maskbits;
	int sender = bcp->cpu;
	cpumask_t *mask = bcp->uvhub_master->cpumask;
	struct bau_control *smaster = bcp->socket_master;
	struct reset_args reset_args;

	reset_args.sender = sender;
	cpus_clear(*mask);
	/* find a single cpu for each uvhub in this distribution mask */
	maskbits = sizeof(struct pnmask) * BITSPERBYTE;
	/* each bit is a pnode relative to the partition base pnode */
	for (pnode = 0; pnode < maskbits; pnode++) {
		int cpu;
		if (!bau_uvhub_isset(pnode, distribution))
			continue;
		apnode = pnode + bcp->partition_base_pnode;
		cpu = pnode_to_first_cpu(apnode, smaster);
		cpu_set(cpu, *mask);
	}

	/* IPI all cpus; preemption is already disabled */
	smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
	return;
}

/*
 * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
 * number, not an absolute. It converts a duration in cycles to a duration in
 * ns.
 */
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	struct cyc2ns_data *data = cyc2ns_read_begin();
	unsigned long long ns;

	ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);

	cyc2ns_read_end(data);
	return ns;
}

/*
 * The reverse of the above; converts a duration in ns to a duration in cycles.
 */ 
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
	struct cyc2ns_data *data = cyc2ns_read_begin();
	unsigned long long cyc;

	cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;

	cyc2ns_read_end(data);
	return cyc;
}

static inline unsigned long cycles_2_us(unsigned long long cyc)
{
	return cycles_2_ns(cyc) / NSEC_PER_USEC;
}

static inline cycles_t sec_2_cycles(unsigned long sec)
{
	return ns_2_cycles(sec * NSEC_PER_SEC);
}

static inline unsigned long long usec_2_cycles(unsigned long usec)
{
	return ns_2_cycles(usec * NSEC_PER_USEC);
}

/*
 * wait for all cpus on this hub to finish their sends and go quiet
 * leaves uvhub_quiesce set so that no new broadcasts are started by
 * bau_flush_send_and_wait()
 */
static inline void quiesce_local_uvhub(struct bau_control *hmaster)
{
	atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}

/*
 * mark this quiet-requestor as done
 */
static inline void end_uvhub_quiesce(struct bau_control *hmaster)
{
	atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}

static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
{
	unsigned long descriptor_status;

	descriptor_status = uv_read_local_mmr(mmr_offset);
	descriptor_status >>= right_shift;
	descriptor_status &= UV_ACT_STATUS_MASK;
	return descriptor_status;
}

/*
 * Wait for completion of a broadcast software ack message
 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
 */
static int uv1_wait_completion(struct bau_desc *bau_desc,
				unsigned long mmr_offset, int right_shift,
				struct bau_control *bcp, long try)
{
	unsigned long descriptor_status;
	cycles_t ttm;
	struct ptc_stats *stat = bcp->statp;

	descriptor_status = uv1_read_status(mmr_offset, right_shift);
	/* spin on the status MMR, waiting for it to go idle */
	while ((descriptor_status != DS_IDLE)) {
		/*
		 * Our software ack messages may be blocked because
		 * there are no swack resources available.  As long
		 * as none of them has timed out hardware will NACK
		 * our message and its state will stay IDLE.
		 */
		if (descriptor_status == DS_SOURCE_TIMEOUT) {
			stat->s_stimeout++;
			return FLUSH_GIVEUP;
		} else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
			stat->s_dtimeout++;
			ttm = get_cycles();

			/*
			 * Our retries may be blocked by all destination
			 * swack resources being consumed, and a timeout
			 * pending.  In that case hardware returns the
			 * ERROR that looks like a destination timeout.
			 */
			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
				bcp->conseccompletes = 0;
				return FLUSH_RETRY_PLUGGED;
			}

			bcp->conseccompletes = 0;
			return FLUSH_RETRY_TIMEOUT;
		} else {
			/*
			 * descriptor_status is still BUSY
			 */
			cpu_relax();
		}
		descriptor_status = uv1_read_status(mmr_offset, right_shift);
	}
	bcp->conseccompletes++;
	return FLUSH_COMPLETE;
}

/*
 * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
 * But not currently used.
 */
static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
{
	unsigned long descriptor_status;

	descriptor_status =
		((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
	return descriptor_status;
}

/*
 * Return whether the status of the descriptor that is normally used for this
 * cpu (the one indexed by its hub-relative cpu number) is busy.
 * The status of the original 32 descriptors is always reflected in the 64
 * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0.
 * The bit provided by the activation_status_2 register is irrelevant to
 * the status if it is only being tested for busy or not busy.
 */
int normal_busy(struct bau_control *bcp)
{
	int cpu = bcp->uvhub_cpu;
	int mmr_offset;
	int right_shift;

	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
	right_shift = cpu * UV_ACT_STATUS_SIZE;
	return (((((read_lmmr(mmr_offset) >> right_shift) &
				UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY);
}

/*
 * Entered when a bau descriptor has gone into a permanent busy wait because
 * of a hardware bug.
 * Workaround the bug.
 */
int handle_uv2_busy(struct bau_control *bcp)
{
	struct ptc_stats *stat = bcp->statp;

	stat->s_uv2_wars++;
	bcp->busy = 1;
	return FLUSH_GIVEUP;
}

static int uv2_wait_completion(struct bau_desc *bau_desc,
				unsigned long mmr_offset, int right_shift,
				struct bau_control *bcp, long try)
{
	unsigned long descriptor_stat;
	cycles_t ttm;
	int desc = bcp->uvhub_cpu;
	long busy_reps = 0;
	struct ptc_stats *stat = bcp->statp;

	descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);

	/* spin on the status MMR, waiting for it to go idle */
	while (descriptor_stat != UV2H_DESC_IDLE) {
		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
			/*
			 * A h/w bug on the destination side may
			 * have prevented the message being marked
			 * pending, thus it doesn't get replied to
			 * and gets continually nacked until it times
			 * out with a SOURCE_TIMEOUT.
			 */
			stat->s_stimeout++;
			return FLUSH_GIVEUP;
		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
			ttm = get_cycles();

			/*
			 * Our retries may be blocked by all destination
			 * swack resources being consumed, and a timeout
			 * pending.  In that case hardware returns the
			 * ERROR that looks like a destination timeout.
			 * Without using the extended status we have to
			 * deduce from the short time that this was a
			 * strong nack.
			 */
			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
				bcp->conseccompletes = 0;
				stat->s_plugged++;
				/* FLUSH_RETRY_PLUGGED causes hang on boot */
				return FLUSH_GIVEUP;
			}
			stat->s_dtimeout++;
			bcp->conseccompletes = 0;
			/* FLUSH_RETRY_TIMEOUT causes hang on boot */
			return FLUSH_GIVEUP;
		} else {
			busy_reps++;
			if (busy_reps > 1000000) {
				/* not to hammer on the clock */
				busy_reps = 0;
				ttm = get_cycles();
				if ((ttm - bcp->send_message) >
						bcp->timeout_interval)
					return handle_uv2_busy(bcp);
			}
			/*
			 * descriptor_stat is still BUSY
			 */
			cpu_relax();
		}
		descriptor_stat = uv2_read_status(mmr_offset, right_shift,
									desc);
	}
	bcp->conseccompletes++;
	return FLUSH_COMPLETE;
}

/*
 * There are 2 status registers; each and array[32] of 2 bits. Set up for
 * which register to read and position in that register based on cpu in
 * current hub.
 */
static int wait_completion(struct bau_desc *bau_desc,
				struct bau_control *bcp, long try)
{
	int right_shift;
	unsigned long mmr_offset;
	int desc = bcp->uvhub_cpu;

	if (desc < UV_CPUS_PER_AS) {
		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
		right_shift = desc * UV_ACT_STATUS_SIZE;
	} else {
		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
		right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
	}

	if (bcp->uvhub_version == 1)
		return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
								bcp, try);
	else
		return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
								bcp, try);
}

/*
 * Our retries are blocked by all destination sw ack resources being
 * in use, and a timeout is pending. In that case hardware immediately
 * returns the ERROR that looks like a destination timeout.
 */
static void destination_plugged(struct bau_desc *bau_desc,
			struct bau_control *bcp,
			struct bau_control *hmaster, struct ptc_stats *stat)
{
	udelay(bcp->plugged_delay);
	bcp->plugged_tries++;

	if (bcp->plugged_tries >= bcp->plugsb4reset) {
		bcp->plugged_tries = 0;

		quiesce_local_uvhub(hmaster);

		spin_lock(&hmaster->queue_lock);
		reset_with_ipi(&bau_desc->distribution, bcp);
		spin_unlock(&hmaster->queue_lock);

		end_uvhub_quiesce(hmaster);

		bcp->ipi_attempts++;
		stat->s_resets_plug++;
	}
}

static void destination_timeout(struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster,
			struct ptc_stats *stat)
{
	hmaster->max_concurr = 1;
	bcp->timeout_tries++;
	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
		bcp->timeout_tries = 0;

		quiesce_local_uvhub(hmaster);

		spin_lock(&hmaster->queue_lock);
		reset_with_ipi(&bau_desc->distribution, bcp);
		spin_unlock(&hmaster->queue_lock);

		end_uvhub_quiesce(hmaster);

		bcp->ipi_attempts++;
		stat->s_resets_timeout++;
	}
}

/*
 * Stop all cpus on a uvhub from using the BAU for a period of time.
 * This is reversed by check_enable.
 */
static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
{
	int tcpu;
	struct bau_control *tbcp;
	struct bau_control *hmaster;
	cycles_t tm1;

	hmaster = bcp->uvhub_master;
	spin_lock(&hmaster->disable_lock);
	if (!bcp->baudisabled) {
		stat->s_bau_disabled++;
		tm1 = get_cycles();
		for_each_present_cpu(tcpu) {
			tbcp = &per_cpu(bau_control, tcpu);
			if (tbcp->uvhub_master == hmaster) {
				tbcp->baudisabled = 1;
				tbcp->set_bau_on_time =
					tm1 + bcp->disabled_period;
			}
		}
	}
	spin_unlock(&hmaster->disable_lock);
}

static void count_max_concurr(int stat, struct bau_control *bcp,
				struct bau_control *hmaster)
{
	bcp->plugged_tries = 0;
	bcp->timeout_tries = 0;
	if (stat != FLUSH_COMPLETE)
		return;
	if (bcp->conseccompletes <= bcp->complete_threshold)
		return;
	if (hmaster->max_concurr >= hmaster->max_concurr_const)
		return;
	hmaster->max_concurr++;
}

static void record_send_stats(cycles_t time1, cycles_t time2,
		struct bau_control *bcp, struct ptc_stats *stat,
		int completion_status, int try)
{
	cycles_t elapsed;

	if (time2 > time1) {
		elapsed = time2 - time1;
		stat->s_time += elapsed;

		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
			bcp->period_requests++;
			bcp->period_time += elapsed;
			if ((elapsed > congested_cycles) &&
			    (bcp->period_requests > bcp->cong_reps) &&
			    ((bcp->period_time / bcp->period_requests) >
							congested_cycles)) {
				stat->s_congested++;
				disable_for_period(bcp, stat);
			}
		}
	} else
		stat->s_requestor--;

	if (completion_status == FLUSH_COMPLETE && try > 1)
		stat->s_retriesok++;
	else if (completion_status == FLUSH_GIVEUP) {
		stat->s_giveup++;
		if (get_cycles() > bcp->period_end)
			bcp->period_giveups = 0;
		bcp->period_giveups++;
		if (bcp->period_giveups == 1)
			bcp->period_end = get_cycles() + bcp->disabled_period;
		if (bcp->period_giveups > bcp->giveup_limit) {
			disable_for_period(bcp, stat);
			stat->s_giveuplimit++;
		}
	}
}

/*
 * Because of a uv1 hardware bug only a limited number of concurrent
 * requests can be made.
 */
static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
{
	spinlock_t *lock = &hmaster->uvhub_lock;
	atomic_t *v;

	v = &hmaster->active_descriptor_count;
	if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
		stat->s_throttles++;
		do {
			cpu_relax();
		} while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
	}
}

/*
 * Handle the completion status of a message send.
 */
static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster,
			struct ptc_stats *stat)
{
	if (completion_status == FLUSH_RETRY_PLUGGED)
		destination_plugged(bau_desc, bcp, hmaster, stat);
	else if (completion_status == FLUSH_RETRY_TIMEOUT)
		destination_timeout(bau_desc, bcp, hmaster, stat);
}

/*
 * Send a broadcast and wait for it to complete.
 *
 * The flush_mask contains the cpus the broadcast is to be sent to including
 * cpus that are on the local uvhub.
 *
 * Returns 0 if all flushing represented in the mask was done.
 * Returns 1 if it gives up entirely and the original cpu mask is to be
 * returned to the kernel.
 */
int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
	struct bau_desc *bau_desc)
{
	int seq_number = 0;
	int completion_stat = 0;
	int uv1 = 0;
	long try = 0;
	unsigned long index;
	cycles_t time1;
	cycles_t time2;
	struct ptc_stats *stat = bcp->statp;
	struct bau_control *hmaster = bcp->uvhub_master;
	struct uv1_bau_msg_header *uv1_hdr = NULL;
	struct uv2_bau_msg_header *uv2_hdr = NULL;

	if (bcp->uvhub_version == 1) {
		uv1 = 1;
		uv1_throttle(hmaster, stat);
	}

	while (hmaster->uvhub_quiesce)
		cpu_relax();

	time1 = get_cycles();
	if (uv1)
		uv1_hdr = &bau_desc->header.uv1_hdr;
	else
		uv2_hdr = &bau_desc->header.uv2_hdr;

	do {
		if (try == 0) {
			if (uv1)
				uv1_hdr->msg_type = MSG_REGULAR;
			else
				uv2_hdr->msg_type = MSG_REGULAR;
			seq_number = bcp->message_number++;
		} else {
			if (uv1)
				uv1_hdr->msg_type = MSG_RETRY;
			else
				uv2_hdr->msg_type = MSG_RETRY;
			stat->s_retry_messages++;
		}

		if (uv1)
			uv1_hdr->sequence = seq_number;
		else
			uv2_hdr->sequence = seq_number;
		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
		bcp->send_message = get_cycles();

		write_mmr_activation(index);

		try++;
		completion_stat = wait_completion(bau_desc, bcp, try);

		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);

		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
			bcp->ipi_attempts = 0;
			stat->s_overipilimit++;
			completion_stat = FLUSH_GIVEUP;
			break;
		}
		cpu_relax();
	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
		 (completion_stat == FLUSH_RETRY_TIMEOUT));

	time2 = get_cycles();

	count_max_concurr(completion_stat, bcp, hmaster);

	while (hmaster->uvhub_quiesce)
		cpu_relax();

	atomic_dec(&hmaster->active_descriptor_count);

	record_send_stats(time1, time2, bcp, stat, completion_stat, try);

	if (completion_stat == FLUSH_GIVEUP)
		/* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
		return 1;
	return 0;
}

/*
 * The BAU is disabled for this uvhub. When the disabled time period has
 * expired re-enable it.
 * Return 0 if it is re-enabled for all cpus on this uvhub.
 */
static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
{
	int tcpu;
	struct bau_control *tbcp;
	struct bau_control *hmaster;

	hmaster = bcp->uvhub_master;
	spin_lock(&hmaster->disable_lock);
	if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
		stat->s_bau_reenabled++;
		for_each_present_cpu(tcpu) {
			tbcp = &per_cpu(bau_control, tcpu);
			if (tbcp->uvhub_master == hmaster) {
				tbcp->baudisabled = 0;
				tbcp->period_requests = 0;
				tbcp->period_time = 0;
				tbcp->period_giveups = 0;
			}
		}
		spin_unlock(&hmaster->disable_lock);
		return 0;
	}
	spin_unlock(&hmaster->disable_lock);
	return -1;
}

static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
				int remotes, struct bau_desc *bau_desc)
{
	stat->s_requestor++;
	stat->s_ntargcpu += remotes + locals;
	stat->s_ntargremotes += remotes;
	stat->s_ntarglocals += locals;