Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmdomain.c
*
* defines domain join / leave apis
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/err.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#include "dlmver.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
#include "cluster/masklog.h"
/* Release a hash page vector built by dlm_alloc_pagevec(): free each
 * page in the vector, then the vector array itself. */
static void dlm_free_pagevec(void **vec, int pages)
{
	int i;

	for (i = pages - 1; i >= 0; i--)
		free_page((unsigned long)vec[i]);
	kfree(vec);
}
/* Allocate a vector of @pages free pages for the lockres hash.
 * Returns the vector, or NULL on allocation failure (any pages
 * obtained so far are released). */
static void **dlm_alloc_pagevec(int pages)
{
	void **vec;
	int allocated;

	vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
	if (!vec)
		return NULL;

	for (allocated = 0; allocated < pages; allocated++) {
		vec[allocated] = (void *)__get_free_page(GFP_KERNEL);
		if (!vec[allocated])
			goto out_free;
	}

	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
	     pages, (unsigned long)DLM_HASH_PAGES,
	     (unsigned long)DLM_BUCKETS_PER_PAGE);
	return vec;

out_free:
	/* only the first @allocated entries hold valid pages */
	dlm_free_pagevec(vec, allocated);
	return NULL;
}
/*
*
* spinlock lock ordering: if multiple locks are needed, obey this ordering:
* dlm_domain_lock
* struct dlm_ctxt->spinlock
* struct dlm_lock_resource->spinlock
* struct dlm_ctxt->master_lock
* struct dlm_ctxt->ast_lock
* dlm_master_list_entry->spinlock
* dlm_lock->spinlock
*
*/
LIST_HEAD(dlm_domains);
static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
#define DLM_DOMAIN_BACKOFF_MS 200
static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
/* Remove @lockres from the lockres hash and drop the reference the
 * hash table held on it (taken in __dlm_insert_lockres).
 * NOTE(review): callers appear to hold dlm->spinlock as with
 * __dlm_insert_lockres -- confirm, there is no assert here. */
void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
{
	hlist_del_init(&lockres->hash_node);
	dlm_lockres_put(lockres);
}
/* Insert @res into @dlm's lockres hash, using the lockname's
 * precomputed hash to pick the bucket.  Takes a reference on @res for
 * the hash table.  Caller must hold dlm->spinlock.
 *
 * Fix: the pasted source lost the declaration of @bucket and the
 * function's closing brace; both are restored here. */
void __dlm_insert_lockres(struct dlm_ctxt *dlm,
			  struct dlm_lock_resource *res)
{
	struct hlist_head *bucket;
	struct qstr *q;

	assert_spin_locked(&dlm->spinlock);

	q = &res->lockname;
	bucket = dlm_lockres_hash(dlm, q->hash);

	/* get a reference for our hashtable */
	dlm_lockres_get(res);

	hlist_add_head(&res->hash_node, bucket);
}
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
const char *name,
unsigned int len,
unsigned int hash)
struct hlist_node *list;
mlog_entry("%.*s\n", len, name);
assert_spin_locked(&dlm->spinlock);
bucket = dlm_lockres_hash(dlm, hash);
hlist_for_each(list, bucket) {
struct dlm_lock_resource *res = hlist_entry(list,
struct dlm_lock_resource, hash_node);
if (res->lockname.name[0] != name[0])
continue;
if (unlikely(res->lockname.len != len))
continue;
if (memcmp(res->lockname.name + 1, name + 1, len - 1))
continue;
dlm_lockres_get(res);
return res;
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/* intended to be called by functions which do not care about lock
* resources which are being purged (most net _handler functions).
* this will return NULL for any lock resource which is found but
* currently in the process of dropping its mastery reference.
* use __dlm_lookup_lockres_full when you need the lock resource
* regardless (e.g. dlm_get_lock_resource) */
/* Look up a lock resource by name, hiding resources that are dropping
 * their mastery reference.  Returns a referenced resource or NULL.
 * Caller must hold dlm->spinlock. */
struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
						const char *name,
						unsigned int len,
						unsigned int hash)
{
	struct dlm_lock_resource *res = NULL;
	mlog_entry("%.*s\n", len, name);
	assert_spin_locked(&dlm->spinlock);
	res = __dlm_lookup_lockres_full(dlm, name, len, hash);
	if (res) {
		/* the full lookup took a reference; if the resource is
		 * being purged, give the reference back and report no
		 * match */
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_DROPPING_REF) {
			spin_unlock(&res->spinlock);
			dlm_lockres_put(res);
			return NULL;
		}
		spin_unlock(&res->spinlock);
	}
	return res;
}
/* Locking wrapper around __dlm_lookup_lockres: computes the name hash,
 * takes dlm->spinlock and looks the resource up.  Returns a referenced
 * resource or NULL.
 *
 * Fix: the pasted source had a run of stray gutter numbers embedded in
 * the middle of the function; the body is restored to the contiguous
 * lock/lookup/unlock sequence its visible head and tail form. */
struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
					      const char *name,
					      unsigned int len)
{
	struct dlm_lock_resource *res;
	unsigned int hash = dlm_lockid_hash(name, len);

	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, len, hash);
	spin_unlock(&dlm->spinlock);
	return res;
}
/* Find a registered dlm ctxt whose name matches @domain/@len exactly.
 * Returns the ctxt (no extra reference taken) or NULL.  Caller must
 * hold dlm_domain_lock. */
static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
{
	struct dlm_ctxt *found = NULL;
	struct list_head *pos;

	assert_spin_locked(&dlm_domain_lock);

	/* A ctxt's name is always NUL terminated, but @domain need not
	 * be -- compare lengths first, then the raw bytes. */
	list_for_each(pos, &dlm_domains) {
		struct dlm_ctxt *ctxt = list_entry(pos, struct dlm_ctxt, list);

		if (strlen(ctxt->name) == len &&
		    !memcmp(ctxt->name, domain, len)) {
			found = ctxt;
			break;
		}
	}

	return found;
}
/* For null terminated domain strings ONLY -- thin wrapper that derives
 * the length with strlen().  Caller must hold dlm_domain_lock. */
static struct dlm_ctxt * __dlm_lookup_domain(const char *domain)
{
	assert_spin_locked(&dlm_domain_lock);
	return __dlm_lookup_domain_full(domain, strlen(domain));
}
/* returns true on one of two conditions:
 * 1) the domain does not exist
 * 2) the domain exists and its state is "joined" */
static int dlm_wait_on_domain_helper(const char *domain)
{
	struct dlm_ctxt *found;
	int ret;

	spin_lock(&dlm_domain_lock);
	found = __dlm_lookup_domain(domain);
	ret = (!found || found->dlm_state == DLM_CTXT_JOINED);
	spin_unlock(&dlm_domain_lock);

	return ret;
}
/* Final teardown of a dlm ctxt's memory: the lockres hash page vector,
 * the domain name string, then the ctxt itself.  Called only from the
 * kref release path (dlm_ctxt_release).
 *
 * Fixes: stray gutter numbers pasted into the middle of the function
 * are removed, and the redundant NULL check before kfree(dlm->name)
 * is dropped -- kfree(NULL) is a no-op. */
static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
{
	dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
	kfree(dlm->name);
	kfree(dlm);
}
/* A little strange - this function will be called while holding
* dlm_domain_lock and is expected to be holding it on the way out. We
* will however drop and reacquire it multiple times */
/* kref release callback for a dlm ctxt.  Entered with dlm_domain_lock
 * held (see dlm_put); drops it around the actual freeing and
 * reacquires it before returning. */
static void dlm_ctxt_release(struct kref *kref)
{
	struct dlm_ctxt *dlm;
	dlm = container_of(kref, struct dlm_ctxt, dlm_refs);
	/* a ctxt with outstanding joins or still marked JOINED must
	 * never reach a zero refcount */
	BUG_ON(dlm->num_joins);
	BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED);
	/* we may still be in the list if we hit an error during join. */
	list_del_init(&dlm->list);
	spin_unlock(&dlm_domain_lock);
	mlog(0, "freeing memory from domain %s\n", dlm->name);
	/* wake waiters (e.g. dlm_wait_on_domain_helper callers) now
	 * that this domain is gone from the list */
	wake_up(&dlm_domain_events);
	dlm_free_ctxt_mem(dlm);
	spin_lock(&dlm_domain_lock);
}
/* Drop a reference on @dlm.  dlm_domain_lock is held across the
 * kref_put because dlm_ctxt_release expects to be entered with it
 * held. */
void dlm_put(struct dlm_ctxt *dlm)
{
	spin_lock(&dlm_domain_lock);
	kref_put(&dlm->dlm_refs, dlm_ctxt_release);
	spin_unlock(&dlm_domain_lock);
}
/* Take an extra reference on @dlm.  Caller must already hold a valid
 * reference, or dlm_domain_lock as in dlm_grab(). */
static void __dlm_get(struct dlm_ctxt *dlm)
{
	kref_get(&dlm->dlm_refs);
}
/* given a questionable reference to a dlm object, gets a reference if
* it can find it in the list, otherwise returns NULL in which case
* you shouldn't trust your pointer. */
/* Validate a questionable ctxt pointer against the registered-domain
 * list under dlm_domain_lock.  Returns @dlm with an extra reference if
 * it is still registered, otherwise NULL. */
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
{
	struct dlm_ctxt *found = NULL;
	struct list_head *pos;

	spin_lock(&dlm_domain_lock);
	list_for_each(pos, &dlm_domains) {
		struct dlm_ctxt *cur = list_entry(pos, struct dlm_ctxt, list);

		if (cur == dlm) {
			__dlm_get(cur);
			found = cur;
			break;
		}
	}
	spin_unlock(&dlm_domain_lock);

	return found;
}
/* True once the domain has completed its join -- JOINED, or already
 * past it into IN_SHUTDOWN. */
int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
{
	int ret = 0;

	spin_lock(&dlm_domain_lock);
	if (dlm->dlm_state == DLM_CTXT_JOINED ||
	    dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
		ret = 1;
	spin_unlock(&dlm_domain_lock);

	return ret;
}
/* Flush and destroy the domain's workqueue, if one was created. */
static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
{
	struct workqueue_struct *wq = dlm->dlm_worker;

	if (!wq)
		return;

	flush_workqueue(wq);
	destroy_workqueue(wq);
	dlm->dlm_worker = NULL;
}
/* Final stage of domain shutdown: tear down network handlers and
 * helper threads, then unlink the ctxt from the global domain list so
 * the kref machinery can free it. */
static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
{
	dlm_unregister_domain_handlers(dlm);
	dlm_complete_thread(dlm);
	dlm_complete_recovery_thread(dlm);
	/* We've left the domain. Now we can take ourselves out of the
	 * list and allow the kref stuff to help us free the
	 * memory. */
	spin_lock(&dlm_domain_lock);
	list_del_init(&dlm->list);
	spin_unlock(&dlm_domain_lock);
	/* Wake up anyone waiting for us to remove this domain */
	wake_up(&dlm_domain_events);
}
static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
int i, num, n, ret = 0;
struct dlm_lock_resource *res;
struct hlist_node *iter;
struct hlist_head *bucket;
int dropped;
mlog(0, "Migrating locks from domain %s\n", dlm->name);
spin_lock(&dlm->spinlock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
redo_bucket:
n = 0;
bucket = dlm_lockres_hash(dlm, i);
iter = bucket->first;
while (iter) {
n++;
res = hlist_entry(iter, struct dlm_lock_resource,
hash_node);
/* migrate, if necessary. this will drop the dlm
* spinlock and retake it if it does migration. */
dropped = dlm_empty_lockres(dlm, res);
spin_lock(&res->spinlock);
__dlm_lockres_calc_usage(dlm, res);
iter = res->hash_node.next;
spin_unlock(&res->spinlock);
cond_resched_lock(&dlm->spinlock);
if (dropped)
goto redo_bucket;
num += n;
mlog(0, "%s: touched %d lockreses in bucket %d "
"(tot=%d)\n", dlm->name, n, i, num);
}
spin_unlock(&dlm->spinlock);
wake_up(&dlm->dlm_thread_wq);
/* let the dlm thread take care of purging, keep scanning until
* nothing remains in the hash */
if (num) {
mlog(0, "%s: %d lock resources in hash last pass\n",
dlm->name, num);
ret = -EAGAIN;
}
mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
}
/* True when no node is currently in the middle of joining this
 * domain.  Used as a wait_event condition by dlm_mark_domain_leaving. */
static int dlm_no_joining_node(struct dlm_ctxt *dlm)
{
	int quiescent;

	spin_lock(&dlm->spinlock);
	quiescent = (dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN);
	spin_unlock(&dlm->spinlock);

	return quiescent;
}
/* Flip the domain to DLM_CTXT_LEAVING, first waiting out any node that
 * is mid-join.  Both locks are needed: dlm_domain_lock guards
 * dlm_state, dlm->spinlock guards joining_node; the documented
 * ordering (domain lock first) is respected. */
static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
{
	for (;;) {
		spin_lock(&dlm_domain_lock);
		spin_lock(&dlm->spinlock);

		if (dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN)
			break;

		mlog(0, "Node %d is joining, we wait on it.\n",
		     dlm->joining_node);
		spin_unlock(&dlm->spinlock);
		spin_unlock(&dlm_domain_lock);

		/* drop both locks while sleeping, then re-check */
		wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm));
	}

	dlm->dlm_state = DLM_CTXT_LEAVING;
	spin_unlock(&dlm->spinlock);
	spin_unlock(&dlm_domain_lock);
}
/* Print the set of nodes currently in the domain map.  Caller must
 * hold dlm->spinlock.
 *
 * Fix: the pasted source lost the loop body and the trailing newline
 * printk, leaving an empty loop and unbalanced braces; they are
 * restored here. */
static void __dlm_print_nodes(struct dlm_ctxt *dlm)
{
	int node = -1;

	assert_spin_locked(&dlm->spinlock);

	printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);

	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
				     node + 1)) < O2NM_MAX_NODES) {
		printk("%d ", node);
	}
	printk("\n");
}
/* Network handler: a peer announces it is leaving the domain.  Clear
 * it from our domain map and notify heartbeat listeners.  Always
 * returns 0 (the exit message needs no meaningful reply).
 *
 * Fix: a run of stray gutter numbers pasted into the middle of the
 * function is removed; the visible head and tail join contiguously. */
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
{
	struct dlm_ctxt *dlm = data;
	unsigned int node;
	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;

	mlog_entry("%p %u %p", msg, len, data);

	/* domain may already be unregistered -- nothing to do then */
	if (!dlm_grab(dlm))
		return 0;

	node = exit_msg->node_idx;

	printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);

	spin_lock(&dlm->spinlock);
	clear_bit(node, dlm->domain_map);
	__dlm_print_nodes(dlm);

	/* notify anything attached to the heartbeat events */
	dlm_hb_event_notify_attached(dlm, node, 0);

	spin_unlock(&dlm->spinlock);

	dlm_put(dlm);

	return 0;
}
/* Tell @node that this node is leaving the domain.  Returns the
 * o2net_send_message() status (0 or negative errno). */
static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
				    unsigned int node)
{
	int ret;
	struct dlm_exit_domain leave_msg;

	mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
	     node, dlm->name, dlm->node_num);

	memset(&leave_msg, 0, sizeof(leave_msg));
	leave_msg.node_idx = dlm->node_num;

	ret = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
				 &leave_msg, sizeof(leave_msg), node,
				 NULL);

	mlog(0, "status return %d from o2net_send_message\n", ret);

	return ret;
}
/* Withdraw this node from the domain map and send an exit message to
 * every remaining member, retrying transient send failures.  Called
 * after all lock resources have been migrated away (see
 * dlm_unregister_domain). */
static void dlm_leave_domain(struct dlm_ctxt *dlm)
{
	int node, clear_node, status;
	/* At this point we've migrated away all our locks and won't
	 * accept mastership of new ones. The dlm is responsible for
	 * almost nothing now. We make sure not to confuse any joining
	 * nodes and then commence shutdown procedure. */
	spin_lock(&dlm->spinlock);
	/* Clear ourselves from the domain map */
	clear_bit(dlm->node_num, dlm->domain_map);
	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
				     0)) < O2NM_MAX_NODES) {
		/* Drop the dlm spinlock. This is safe wrt the domain_map.
		 * -nodes cannot be added now as the
		 * query_join_handlers knows to respond with OK_NO_MAP
		 * -we catch the right network errors if a node is
		 * removed from the map while we're sending him the
		 * exit message. */
		spin_unlock(&dlm->spinlock);
		clear_node = 1;
		status = dlm_send_one_domain_exit(dlm, node);
		/* -ENOPROTOOPT / -ENOTCONN mean the peer's dlm is gone
		 * or unreachable -- treat as delivered */
		if (status < 0 &&
		    status != -ENOPROTOOPT &&
		    status != -ENOTCONN) {
			mlog(ML_NOTICE, "Error %d sending domain exit message "
			     "to node %d\n", status, node);
			/* Not sure what to do here but lets sleep for
			 * a bit in case this was a transient
			 * error... */
			msleep(DLM_DOMAIN_BACKOFF_MS);
			clear_node = 0;
		}
		spin_lock(&dlm->spinlock);
		/* If we're not clearing the node bit then we intend
		 * to loop back around to try again. */
		if (clear_node)
			clear_bit(node, dlm->domain_map);
	}
	spin_unlock(&dlm->spinlock);
}
/* True once the domain has reached the JOINED state. */
int dlm_joined(struct dlm_ctxt *dlm)
{
	int joined;

	spin_lock(&dlm_domain_lock);
	joined = (dlm->dlm_state == DLM_CTXT_JOINED);
	spin_unlock(&dlm_domain_lock);

	return joined;
}
/* True while the domain is winding down (IN_SHUTDOWN state). */
int dlm_shutting_down(struct dlm_ctxt *dlm)
{
	int shutting_down;

	spin_lock(&dlm_domain_lock);
	shutting_down = (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
	spin_unlock(&dlm_domain_lock);

	return shutting_down;
}
/* Drop one registration on the domain.  When the last user leaves,
 * mark the ctxt IN_SHUTDOWN, migrate all lock resources away, announce
 * our departure to the other nodes and complete shutdown.  Always
 * drops one reference via dlm_put().
 *
 * Fix: a run of stray gutter numbers pasted into the middle of the
 * function is removed; the visible head and tail join contiguously. */
void dlm_unregister_domain(struct dlm_ctxt *dlm)
{
	int leave = 0;

	spin_lock(&dlm_domain_lock);
	BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
	BUG_ON(!dlm->num_joins);

	dlm->num_joins--;
	if (!dlm->num_joins) {
		/* We mark it "in shutdown" now so new register
		 * requests wait until we've completely left the
		 * domain. Don't use DLM_CTXT_LEAVING yet as we still
		 * want new domain joins to communicate with us at
		 * least until we've completed migration of our
		 * resources. */
		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
		leave = 1;
	}
	spin_unlock(&dlm_domain_lock);

	if (leave) {
		mlog(0, "shutting down domain %s\n", dlm->name);

		/* We changed dlm state, notify the thread */
		dlm_kick_thread(dlm, NULL);

		/* keep migrating until the hash scan comes up empty */
		while (dlm_migrate_all_locks(dlm)) {
			mlog(0, "%s: more migration to do\n", dlm->name);
		}
		dlm_mark_domain_leaving(dlm);
		dlm_leave_domain(dlm);
		dlm_complete_dlm_shutdown(dlm);
	}
	dlm_put(dlm);
}
EXPORT_SYMBOL_GPL(dlm_unregister_domain);
/* Network handler: a node asks whether it may join the named domain.
 * Returns one of the enum dlm_query_join_response codes directly as
 * the handler status (the sender reads it as the reply).
 *
 * Fixes restored from the corrupted paste: the declaration
 * `int bit = query->node_idx;` was lost (the several uses of `bit`
 * below otherwise reference an undeclared identifier), and a run of
 * stray gutter numbers inside the if/else chain is removed. */
static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
{
	struct dlm_query_join_request *query;
	enum dlm_query_join_response response;
	struct dlm_ctxt *dlm = NULL;

	query = (struct dlm_query_join_request *) msg->buf;

	mlog(0, "node %u wants to join domain %s\n", query->node_idx,
	     query->domain);

	/*
	 * If heartbeat doesn't consider the node live, tell it
	 * to back off and try again.  This gives heartbeat a chance
	 * to catch up.
	 */
	if (!o2hb_check_node_heartbeating(query->node_idx)) {
		mlog(0, "node %u is not in our live map yet\n",
		     query->node_idx);
		response = JOIN_DISALLOW;
		goto respond;
	}

	response = JOIN_OK_NO_MAP;

	spin_lock(&dlm_domain_lock);
	dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
	/* Once the dlm ctxt is marked as leaving then we don't want
	 * to be put in someone's domain map.
	 * Also, explicitly disallow joining at certain troublesome
	 * times (ie. during recovery). */
	if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
		int bit = query->node_idx;
		spin_lock(&dlm->spinlock);

		if (dlm->dlm_state == DLM_CTXT_NEW &&
		    dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) {
			/*If this is a brand new context and we
			 * haven't started our join process yet, then
			 * the other node won the race. */
			response = JOIN_OK_NO_MAP;
		} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
			/* Disallow parallel joins. */
			response = JOIN_DISALLOW;
		} else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
			mlog(ML_NOTICE, "node %u trying to join, but recovery "
			     "is ongoing.\n", bit);
			response = JOIN_DISALLOW;
		} else if (test_bit(bit, dlm->recovery_map)) {
			mlog(ML_NOTICE, "node %u trying to join, but it "
			     "still needs recovery.\n", bit);
			response = JOIN_DISALLOW;
		} else if (test_bit(bit, dlm->domain_map)) {
			mlog(ML_NOTICE, "node %u trying to join, but it "
			     "is still in the domain! needs recovery?\n",
			     bit);
			response = JOIN_DISALLOW;
		} else {
			/* Alright we're fully a part of this domain
			 * so we keep some state as to who's joining
			 * and indicate to him that needs to be fixed
			 * up. */
			response = JOIN_OK;
			__dlm_set_joining_node(dlm, query->node_idx);
		}

		spin_unlock(&dlm->spinlock);
	}
	spin_unlock(&dlm_domain_lock);

respond:
	mlog(0, "We respond with %u\n", response);

	return response;
}
/* Network handler: a node that was granted JOIN_OK confirms it has
 * joined.  Set it in our domain map, clear the joining-node tracking
 * and notify heartbeat listeners.  Always returns 0.
 *
 * Fix: a very large run of stray gutter numbers pasted into the middle
 * of the function is removed; the visible head and tail join
 * contiguously. */
static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
{
	struct dlm_assert_joined *assert;
	struct dlm_ctxt *dlm = NULL;

	assert = (struct dlm_assert_joined *) msg->buf;

	mlog(0, "node %u asserts join on domain %s\n", assert->node_idx,
	     assert->domain);

	spin_lock(&dlm_domain_lock);
	dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len);
	/* XXX should we consider no dlm ctxt an error? */
	if (dlm) {
		spin_lock(&dlm->spinlock);

		/* Alright, this node has officially joined our
		 * domain. Set him in the map and clean up our
		 * leftover join state. */
		BUG_ON(dlm->joining_node != assert->node_idx);
		set_bit(assert->node_idx, dlm->domain_map);
		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);

		printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
		       assert->node_idx, dlm->name);
		__dlm_print_nodes(dlm);

		/* notify anything attached to the heartbeat events */
		dlm_hb_event_notify_attached(dlm, assert->node_idx, 1);

		spin_unlock(&dlm->spinlock);
	}
	spin_unlock(&dlm_domain_lock);

	return 0;
}
/* Network handler: a node that had started joining our domain aborts
 * the join.  Reset our joining-node tracking so later joins can
 * proceed.  Always returns 0. */
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
{
	struct dlm_cancel_join *cancel = (struct dlm_cancel_join *) msg->buf;
	struct dlm_ctxt *dlm;

	mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx,
	     cancel->domain);

	spin_lock(&dlm_domain_lock);
	dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len);
	if (dlm) {
		spin_lock(&dlm->spinlock);
		/* only the node we recorded as joining may cancel */
		BUG_ON(dlm->joining_node != cancel->node_idx);
		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
		spin_unlock(&dlm->spinlock);
	}
	spin_unlock(&dlm_domain_lock);

	return 0;
}
/* Send a join-cancel message to @node telling it to discard any join
 * state it recorded for us.  Returns 0 or a negative error from the
 * network layer.
 *
 * Fix: removed a dead `goto bail` that jumped to the immediately
 * following label -- pure control-flow noise. */
static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
				    unsigned int node)
{
	int status;
	struct dlm_cancel_join cancel_msg;

	memset(&cancel_msg, 0, sizeof(cancel_msg));
	cancel_msg.node_idx = dlm->node_num;
	cancel_msg.name_len = strlen(dlm->name);
	memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);

	status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
				    &cancel_msg, sizeof(cancel_msg), node,
				    NULL);
	if (status < 0)
		mlog_errno(status);

	return status;
}
/* map_size should be in bytes.  Sends a join-cancel to every node set
 * in @node_map except ourselves.  Returns 0, or the first send error
 * encountered (all nodes are still attempted). */
static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
				 unsigned long *node_map,
				 unsigned int map_size)
{
	int status = 0;
	int tmpstat;
	unsigned int node;

	if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
			 sizeof(unsigned long))) {
		mlog(ML_ERROR,
		     "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
		     map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
		return -EINVAL;
	}

	for (node = find_next_bit(node_map, O2NM_MAX_NODES, 0);
	     node < O2NM_MAX_NODES;
	     node = find_next_bit(node_map, O2NM_MAX_NODES, node + 1)) {
		if (node == dlm->node_num)
			continue;

		tmpstat = dlm_send_one_join_cancel(dlm, node);
		if (tmpstat) {
			mlog(ML_ERROR, "Error return %d cancelling join on "
			     "node %d\n", tmpstat, node);
			/* remember the first failure, keep going */
			if (!status)
				status = tmpstat;
		}
	}

	if (status)
		mlog_errno(status);
	return status;
}
/* Ask @node whether we may join the domain.  On success, *response
 * holds the peer's answer.  Returns 0, or a negative error for a
 * network failure / malformed response. */
static int dlm_request_join(struct dlm_ctxt *dlm,
			    int node,
			    enum dlm_query_join_response *response)
{
	int status, retval;
	struct dlm_query_join_request join_msg;
	mlog(0, "querying node %d\n", node);
	memset(&join_msg, 0, sizeof(join_msg));
	join_msg.node_idx = dlm->node_num;
	join_msg.name_len = strlen(dlm->name);
	memcpy(join_msg.domain, dlm->name, join_msg.name_len);
	/* retval receives the peer handler's return value, which is
	 * the join response code (see dlm_query_join_handler) */
	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
				    sizeof(join_msg), node, &retval);
	if (status < 0 && status != -ENOPROTOOPT) {
		mlog_errno(status);
		goto bail;
	}
	/* -ENOPROTOOPT from the net code means the other side isn't
	   listening for our message type -- that's fine, it means
	   his dlm isn't up, so we can consider him a 'yes' but not
	   joined into the domain. */
	if (status == -ENOPROTOOPT) {
		status = 0;
		*response = JOIN_OK_NO_MAP;
	} else if (retval == JOIN_DISALLOW ||
		   retval == JOIN_OK ||
		   retval == JOIN_OK_NO_MAP) {
		*response = retval;
	} else {
		/* anything else is a protocol violation */
		status = -EINVAL;
		mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
		     node);
	}
	mlog(0, "status %d, node %d response is %d\n", status, node,
	     *response);
bail:
	return status;
}
/* Tell @node that our join is complete so it marks us in its domain
 * map.  Returns 0 or a negative error from the network layer. */
static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
				    unsigned int node)
{
	struct dlm_assert_joined assert_msg;
	int ret;

	mlog(0, "Sending join assert to node %u\n", node);

	memset(&assert_msg, 0, sizeof(assert_msg));
	assert_msg.node_idx = dlm->node_num;
	assert_msg.name_len = strlen(dlm->name);
	memcpy(assert_msg.domain, dlm->name, assert_msg.name_len);

	ret = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
				 &assert_msg, sizeof(assert_msg), node,
				 NULL);
	if (ret < 0)
		mlog_errno(ret);

	return ret;
}
/* Broadcast a join assert to every node set in @node_map except
 * ourselves, retrying each node until it either acknowledges or is no
 * longer in the live map. */
static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
				  unsigned long *node_map)
{
	int status, node, live;
	status = 0;
	node = -1;
	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
				     node + 1)) < O2NM_MAX_NODES) {
		if (node == dlm->node_num)
			continue;
		do {
			/* It is very important that this message be
			 * received so we spin until either the node
			 * has died or it gets the message. */
			status = dlm_send_one_join_assert(dlm, node);
			spin_lock(&dlm->spinlock);
			live = test_bit(node, dlm->live_nodes_map);
			spin_unlock(&dlm->spinlock);
			if (status) {
				mlog(ML_ERROR, "Error return %d asserting "
				     "join on node %d\n", status, node);
				/* give us some time between errors... */
				if (live)
					msleep(DLM_DOMAIN_BACKOFF_MS);
			}
		} while (status && live);
	}
}
/* Per-attempt state for a domain join.  live_map snapshots the live
 * node set at the start of the attempt (compared again later in
 * dlm_should_restart_join); yes_resp_map presumably tracks nodes that
 * answered the join query affirmatively -- its use is outside this
 * chunk, confirm against dlm_try_to_join_domain. */
struct domain_join_ctxt {
	unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
/* Decide whether the join attempt must be restarted: either the last
 * peer answered JOIN_DISALLOW, or the live node map has changed since
 * the attempt began. */
static int dlm_should_restart_join(struct dlm_ctxt *dlm,
				   struct domain_join_ctxt *ctxt,
				   enum dlm_query_join_response response)
{
	int changed;

	if (response == JOIN_DISALLOW) {
		mlog(0, "Latest response of disallow -- should restart\n");
		return 1;
	}

	/* For now, we restart the process if the node maps have
	 * changed at all */
	spin_lock(&dlm->spinlock);
	changed = memcmp(ctxt->live_map, dlm->live_nodes_map,
			 sizeof(dlm->live_nodes_map));
	spin_unlock(&dlm->spinlock);

	if (changed)
		mlog(0, "Node maps changed -- should restart\n");

	return changed;
}
static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
{
int status = 0, tmpstat, node;
struct domain_join_ctxt *ctxt;
enum dlm_query_join_response response;
mlog_entry("%p", dlm);
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
if (!ctxt) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
/* group sem locking should work for us here -- we're already
* registered for heartbeat events so filling this should be
* atomic wrt getting those handlers called. */
o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
spin_lock(&dlm->spinlock);
memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
__dlm_set_joining_node(dlm, dlm->node_num);
spin_unlock(&dlm->spinlock);
node = -1;
while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
if (node == dlm->node_num)
continue;
status = dlm_request_join(dlm, node, &response);
if (status < 0) {
mlog_errno(status);