2 * tree234.c: reasonably generic 2-3-4 tree routines. Currently
3 * supports insert, delete, find and iterate operations.
11 #define mknew(typ) ( (typ *) malloc (sizeof (typ)) )
15 #define LOG(x) (printf x)
32 * Create a 2-3-4 tree.
34 tree234 *newtree234(cmpfn234 cmp) {
35 tree234 *ret = mknew(tree234);
36 LOG(("created tree %p\n", ret));
43 * Free a 2-3-4 tree (not including freeing the elements).
45 static void freenode234(node234 *n) {
48 freenode234(n->kids[0]);
49 freenode234(n->kids[1]);
50 freenode234(n->kids[2]);
51 freenode234(n->kids[3]);
54 void freetree234(tree234 *t) {
60 * Add an element e to a 2-3-4 tree t. Returns e on success, or if
61 * an existing element compares equal, returns that.
63 void *add234(tree234 *t, void *e) {
64 node234 *n, **np, *left, *right;
68 LOG(("adding node %p to tree %p\n", e, t));
69 if (t->root == NULL) {
70 t->root = mknew(node234);
71 t->root->elems[1] = t->root->elems[2] = NULL;
72 t->root->kids[0] = t->root->kids[1] = NULL;
73 t->root->kids[2] = t->root->kids[3] = NULL;
74 t->root->parent = NULL;
75 t->root->elems[0] = e;
76 LOG((" created root %p\n", t->root));
83 LOG((" node %p: %p [%p] %p [%p] %p [%p] %p\n",
84 n, n->kids[0], n->elems[0], n->kids[1], n->elems[1],
85 n->kids[2], n->elems[2], n->kids[3]));
86 if ((c = t->cmp(e, n->elems[0])) < 0)
89 return n->elems[0]; /* already exists */
90 else if (n->elems[1] == NULL || (c = t->cmp(e, n->elems[1])) < 0)
93 return n->elems[1]; /* already exists */
94 else if (n->elems[2] == NULL || (c = t->cmp(e, n->elems[2])) < 0)
97 return n->elems[2]; /* already exists */
100 LOG((" moving to child %d (%p)\n", np - n->kids, *np));
104 * We need to insert the new element in n at position np.
109 LOG((" at %p: %p [%p] %p [%p] %p [%p] %p\n",
110 n, n->kids[0], n->elems[0], n->kids[1], n->elems[1],
111 n->kids[2], n->elems[2], n->kids[3]));
112 LOG((" need to insert %p [%p] %p at position %d\n",
113 left, e, right, np - n->kids));
114 if (n->elems[1] == NULL) {
116 * Insert in a 2-node; simple.
118 if (np == &n->kids[0]) {
119 LOG((" inserting on left of 2-node\n"));
120 n->kids[2] = n->kids[1];
121 n->elems[1] = n->elems[0];
125 } else { /* np == &n->kids[1] */
126 LOG((" inserting on right of 2-node\n"));
131 if (n->kids[0]) n->kids[0]->parent = n;
132 if (n->kids[1]) n->kids[1]->parent = n;
133 if (n->kids[2]) n->kids[2]->parent = n;
136 } else if (n->elems[2] == NULL) {
138 * Insert in a 3-node; simple.
140 if (np == &n->kids[0]) {
141 LOG((" inserting on left of 3-node\n"));
142 n->kids[3] = n->kids[2];
143 n->elems[2] = n->elems[1];
144 n->kids[2] = n->kids[1];
145 n->elems[1] = n->elems[0];
149 } else if (np == &n->kids[1]) {
150 LOG((" inserting in middle of 3-node\n"));
151 n->kids[3] = n->kids[2];
152 n->elems[2] = n->elems[1];
156 } else { /* np == &n->kids[2] */
157 LOG((" inserting on right of 3-node\n"));
162 if (n->kids[0]) n->kids[0]->parent = n;
163 if (n->kids[1]) n->kids[1]->parent = n;
164 if (n->kids[2]) n->kids[2]->parent = n;
165 if (n->kids[3]) n->kids[3]->parent = n;
169 node234 *m = mknew(node234);
170 m->parent = n->parent;
171 LOG((" splitting a 4-node; created new node %p\n", m));
173 * Insert in a 4-node; split into a 2-node and a
174 * 3-node, and move focus up a level.
176 * I don't think it matters which way round we put the
177 * 2 and the 3. For simplicity, we'll put the 3 first
180 if (np == &n->kids[0]) {
184 m->elems[1] = n->elems[0];
185 m->kids[2] = n->kids[1];
187 n->kids[0] = n->kids[2];
188 n->elems[0] = n->elems[2];
189 n->kids[1] = n->kids[3];
190 } else if (np == &n->kids[1]) {
191 m->kids[0] = n->kids[0];
192 m->elems[0] = n->elems[0];
197 n->kids[0] = n->kids[2];
198 n->elems[0] = n->elems[2];
199 n->kids[1] = n->kids[3];
200 } else if (np == &n->kids[2]) {
201 m->kids[0] = n->kids[0];
202 m->elems[0] = n->elems[0];
203 m->kids[1] = n->kids[1];
204 m->elems[1] = n->elems[1];
208 n->elems[0] = n->elems[2];
209 n->kids[1] = n->kids[3];
210 } else { /* np == &n->kids[3] */
211 m->kids[0] = n->kids[0];
212 m->elems[0] = n->elems[0];
213 m->kids[1] = n->kids[1];
214 m->elems[1] = n->elems[1];
215 m->kids[2] = n->kids[2];
221 m->kids[3] = n->kids[3] = n->kids[2] = NULL;
222 m->elems[2] = n->elems[2] = n->elems[1] = NULL;
223 if (m->kids[0]) m->kids[0]->parent = m;
224 if (m->kids[1]) m->kids[1]->parent = m;
225 if (m->kids[2]) m->kids[2]->parent = m;
226 if (n->kids[0]) n->kids[0]->parent = n;
227 if (n->kids[1]) n->kids[1]->parent = n;
228 LOG((" left (%p): %p [%p] %p [%p] %p\n", m,
229 m->kids[0], m->elems[0],
230 m->kids[1], m->elems[1],
232 LOG((" right (%p): %p [%p] %p\n", n,
233 n->kids[0], n->elems[0],
239 np = (n->parent->kids[0] == n ? &n->parent->kids[0] :
240 n->parent->kids[1] == n ? &n->parent->kids[1] :
241 n->parent->kids[2] == n ? &n->parent->kids[2] :
242 &n->parent->kids[3]);
247 * If we've come out of here by `break', n will still be
248 * non-NULL and we've finished. If we've come here because n is
249 * NULL, we need to create a new root for the tree because the
250 * old one has just split into two.
253 LOG((" root is overloaded, split into two\n"));
254 t->root = mknew(node234);
255 t->root->kids[0] = left;
256 t->root->elems[0] = e;
257 t->root->kids[1] = right;
258 t->root->elems[1] = NULL;
259 t->root->kids[2] = NULL;
260 t->root->elems[2] = NULL;
261 t->root->kids[3] = NULL;
262 t->root->parent = NULL;
263 if (t->root->kids[0]) t->root->kids[0]->parent = t->root;
264 if (t->root->kids[1]) t->root->kids[1]->parent = t->root;
265 LOG((" new root is %p [%p] %p\n",
266 t->root->kids[0], t->root->elems[0], t->root->kids[1]));
273 * Find an element e in a 2-3-4 tree t. Returns NULL if not found.
274 * e is always passed as the first argument to cmp, so cmp can be
275 * an asymmetric function if desired. cmp can also be passed as
276 * NULL, in which case the compare function from the tree proper
279 void *find234(tree234 *t, void *e, cmpfn234 cmp) {
291 if ( (c = cmp(e, n->elems[0])) < 0)
295 else if (n->elems[1] == NULL || (c = cmp(e, n->elems[1])) < 0)
299 else if (n->elems[2] == NULL || (c = cmp(e, n->elems[2])) < 0)
308 * We've found our way to the bottom of the tree and we know
309 * where we would insert this node if we wanted to. But it
316 * Delete an element e in a 2-3-4 tree. Does not free the element,
317 * merely removes all links to it from the tree nodes.
319 void del234(tree234 *t, void *e) {
324 LOG(("deleting %p from tree %p\n", e, t));
331 LOG((" node %p: %p [%p] %p [%p] %p [%p] %p\n",
332 n, n->kids[0], n->elems[0], n->kids[1], n->elems[1],
333 n->kids[2], n->elems[2], n->kids[3]));
334 if ((c = t->cmp(e, n->elems[0])) < 0) {
338 } else if (n->elems[1] == NULL || (c = t->cmp(e, n->elems[1])) < 0) {
342 } else if (n->elems[2] == NULL || (c = t->cmp(e, n->elems[2])) < 0) {
350 * Recurse down to subtree ki. If it has only one element,
351 * we have to do some transformation to start with.
353 LOG((" moving to subtree %d\n", ki));
355 if (!sub->elems[1]) {
356 LOG((" subtree has only one element!\n", ki));
357 if (ki > 0 && n->kids[ki-1]->elems[1]) {
359 * Case 3a, left-handed variant. Child ki has
360 * only one element, but child ki-1 has two or
361 * more. So we need to move a subtree from ki-1
366 * [more] a A b B c d D e [more] a A b c C d D e
368 node234 *sib = n->kids[ki-1];
369 int lastelem = (sib->elems[2] ? 2 :
370 sib->elems[1] ? 1 : 0);
371 sub->kids[2] = sub->kids[1];
372 sub->elems[1] = sub->elems[0];
373 sub->kids[1] = sub->kids[0];
374 sub->elems[0] = n->elems[ki-1];
375 sub->kids[0] = sib->kids[lastelem+1];
376 if (sub->kids[0]) sub->kids[0]->parent = sub;
377 n->elems[ki-1] = sib->elems[lastelem];
378 sib->kids[lastelem+1] = NULL;
379 sib->elems[lastelem] = NULL;
380 LOG((" case 3a left\n"));
381 } else if (ki < 3 && n->kids[ki+1] &&
382 n->kids[ki+1]->elems[1]) {
384 * Case 3a, right-handed variant. ki has only
385 * one element but ki+1 has two or more. Move a
386 * subtree from ki+1 to ki.
390 * a A b c C d D e [more] a A b B c d D e [more]
392 node234 *sib = n->kids[ki+1];
394 sub->elems[1] = n->elems[ki];
395 sub->kids[2] = sib->kids[0];
396 if (sub->kids[2]) sub->kids[2]->parent = sub;
397 n->elems[ki] = sib->elems[0];
398 sib->kids[0] = sib->kids[1];
399 for (j = 0; j < 2 && sib->elems[j+1]; j++) {
400 sib->kids[j+1] = sib->kids[j+2];
401 sib->elems[j] = sib->elems[j+1];
403 sib->kids[j+1] = NULL;
404 sib->elems[j] = NULL;
405 LOG((" case 3a right\n"));
408 * Case 3b. ki has only one element, and has no
409 * neighbour with more than one. So pick a
410 * neighbour and merge it with ki, taking an
411 * element down from n to go in the middle.
415 * a A b c C d a A b B c C d
417 * (Since at all points we have avoided
418 * descending to a node with only one element,
419 * we can be sure that n is not reduced to
420 * nothingness by this move, _unless_ it was
421 * the very first node, ie the root of the
422 * tree. In that case we remove the now-empty
423 * root and replace it with its single large
434 sub->kids[3] = sub->kids[1];
435 sub->elems[2] = sub->elems[0];
436 sub->kids[2] = sub->kids[0];
437 sub->elems[1] = n->elems[ki];
438 sub->kids[1] = sib->kids[1];
439 if (sub->kids[1]) sub->kids[1]->parent = sub;
440 sub->elems[0] = sib->elems[0];
441 sub->kids[0] = sib->kids[0];
442 if (sub->kids[0]) sub->kids[0]->parent = sub;
447 * That's built the big node in sub. Now we
448 * need to remove the reference to sib in n.
450 for (j = ki; j < 3 && n->kids[j+1]; j++) {
451 n->kids[j] = n->kids[j+1];
452 n->elems[j] = j<2 ? n->elems[j+1] : NULL;
455 if (j < 3) n->elems[j] = NULL;
456 LOG((" case 3b ki=%d\n", ki));
460 * The root is empty and needs to be
463 LOG((" shifting root!\n"));
473 return; /* nothing to do; `already removed' */
476 * Treat special case: this is the one remaining item in
477 * the tree. n is the tree root (no parent), has one
478 * element (no elems[1]), and has no kids (no kids[0]).
480 if (!n->parent && !n->elems[1] && !n->kids[0]) {
481 LOG((" removed last element in tree\n"));
488 * Now we have the element we want, as n->elems[ei], and we
489 * have also arranged for that element not to be the only
490 * one in its node. So...
493 if (!n->kids[0] && n->elems[1]) {
495 * Case 1. n is a leaf node with more than one element,
496 * so it's _really easy_. Just delete the thing and
501 for (i = ei; i < 2 && n->elems[i+1]; i++)
502 n->elems[i] = n->elems[i+1];
504 return; /* finished! */
505 } else if (n->kids[ei]->elems[1]) {
507 * Case 2a. n is an internal node, and the root of the
508 * subtree to the left of e has more than one element.
509 * So find the predecessor p to e (ie the largest node
510 * in that subtree), place it where e currently is, and
511 * then start the deletion process over again on the
512 * subtree with p as target.
514 node234 *m = n->kids[ei];
518 m = (m->kids[3] ? m->kids[3] :
519 m->kids[2] ? m->kids[2] :
520 m->kids[1] ? m->kids[1] : m->kids[0]);
522 target = (m->elems[2] ? m->elems[2] :
523 m->elems[1] ? m->elems[1] : m->elems[0]);
524 n->elems[ei] = target;
527 } else if (n->kids[ei+1]->elems[1]) {
529 * Case 2b, symmetric to 2a but s/left/right/ and
530 * s/predecessor/successor/. (And s/largest/smallest/).
532 node234 *m = n->kids[ei+1];
538 target = m->elems[0];
539 n->elems[ei] = target;
544 * Case 2c. n is an internal node, and the subtrees to
545 * the left and right of e both have only one element.
546 * So combine the two subnodes into a single big node
547 * with their own elements on the left and right and e
548 * in the middle, then restart the deletion process on
549 * that subtree, with e still as target.
551 node234 *a = n->kids[ei], *b = n->kids[ei+1];
555 a->elems[1] = n->elems[ei];
556 a->kids[2] = b->kids[0];
557 if (a->kids[2]) a->kids[2]->parent = a;
558 a->elems[2] = b->elems[0];
559 a->kids[3] = b->kids[1];
560 if (a->kids[3]) a->kids[3]->parent = a;
563 * That's built the big node in a, and destroyed b. Now
564 * remove the reference to b (and e) in n.
566 for (j = ei; j < 2 && n->elems[j+1]; j++) {
567 n->elems[j] = n->elems[j+1];
568 n->kids[j+1] = n->kids[j+2];
573 * It's possible, in this case, that we've just removed
574 * the only element in the root of the tree. If so,
577 if (n->elems[0] == NULL) {
578 LOG((" shifting root!\n"));
584 * Now go round the deletion process again, with n
585 * pointing at the new big node and e still the same.
593 * Iterate over the elements of a tree234, in order.
595 void *first234(tree234 *t, enum234 *e) {
596 node234 *n = t->root;
606 void *next234(enum234 *e) {
607 node234 *n = e->node;
610 if (n->kids[pos+1]) {
619 if (pos < 2 && n->elems[pos+1]) {
621 return n->elems[e->posn];
625 node234 *nn = n->parent;
627 return NULL; /* end of tree */
628 pos = (nn->kids[0] == n ? 0 :
629 nn->kids[1] == n ? 1 :
630 nn->kids[2] == n ? 2 : 3);
632 } while (pos == 3 || n->kids[pos+1] == NULL);
636 return n->elems[pos];
642 * Test code for the 2-3-4 tree. This code maintains an alternative
643 * representation of the data in the tree, in an array (using the
644 * obvious and slow insert and delete functions). After each tree
645 * operation, the verify() function is called, which ensures all
646 * the tree properties are preserved (node->child->parent always
647 * equals node; number of kids == 0 or number of elements + 1;
648 * ordering property between elements of a node and elements of its
649 * children is preserved; tree has the same depth everywhere; every
650 * node has at least one element) and also ensures the list
651 * represented by the tree is the same list it should be. (This
652 * last check also verifies the ordering properties, because the
653 * `same list it should be' is by definition correctly ordered. It
654 * also ensures all nodes are distinct, because the enum functions
655 * would get caught in a loop if not.)
661 * Error reporting function.
663 void error(char *fmt, ...) {
667 vfprintf(stdout, fmt, ap);
672 /* The array representation of the data. */
674 int arraylen, arraysize;
677 /* The tree representation of the same data. */
685 void chknode(chkctx *ctx, int level, node234 *node,
686 void *lowbound, void *highbound) {
690 /* Count the non-NULL kids. */
691 for (nkids = 0; nkids < 4 && node->kids[nkids]; nkids++);
692 /* Ensure no kids beyond the first NULL are non-NULL. */
693 for (i = nkids; i < 4; i++)
695 error("node %p: nkids=%d but kids[%d] non-NULL",
699 /* Count the non-NULL elements. */
700 for (nelems = 0; nelems < 3 && node->elems[nelems]; nelems++);
701 /* Ensure no elements beyond the first NULL are non-NULL. */
702 for (i = nelems; i < 3; i++)
703 if (node->elems[i]) {
704 error("node %p: nelems=%d but elems[%d] non-NULL",
710 * If nkids==0, this is a leaf node; verify that the tree
711 * depth is the same everywhere.
713 if (ctx->treedepth < 0)
714 ctx->treedepth = level; /* we didn't know the depth yet */
715 else if (ctx->treedepth != level)
716 error("node %p: leaf at depth %d, previously seen depth %d",
717 node, level, ctx->treedepth);
720 * If nkids != 0, then it should be nelems+1, unless nelems
721 * is 0 in which case nkids should also be 0 (and so we
722 * shouldn't be in this condition at all).
724 int shouldkids = (nelems ? nelems+1 : 0);
725 if (nkids != shouldkids) {
726 error("node %p: %d elems should mean %d kids but has %d",
727 node, nelems, shouldkids, nkids);
732 * nelems should be at least 1.
735 error("node %p: no elems", node, nkids);
739 * Add nelems to the running element count of the whole tree
740 * (to ensure the enum234 routines see them all).
742 ctx->elemcount += nelems;
745 * Check ordering property: all elements should be strictly >
746 * lowbound, strictly < highbound, and strictly < each other in
747 * sequence. (lowbound and highbound are NULL at edges of tree
748 * - both NULL at root node - and NULL is considered to be <
749 * everything and > everything. IYSWIM.)
751 for (i = -1; i < nelems; i++) {
752 void *lower = (i == -1 ? lowbound : node->elems[i]);
753 void *higher = (i+1 == nelems ? highbound : node->elems[i+1]);
754 if (lower && higher && cmp(lower, higher) >= 0) {
755 error("node %p: kid comparison [%d=%s,%d=%s] failed",
756 node, i, lower, i+1, higher);
761 * Check parent pointers: all non-NULL kids should have a
762 * parent pointer coming back to this node.
764 for (i = 0; i < nkids; i++)
765 if (node->kids[i]->parent != node) {
766 error("node %p kid %d: parent ptr is %p not %p",
767 node, i, node->kids[i]->parent, node);
772 * Now (finally!) recurse into subtrees.
774 for (i = 0; i < nkids; i++) {
775 void *lower = (i == 0 ? lowbound : node->elems[i-1]);
776 void *higher = (i >= nelems ? highbound : node->elems[i]);
777 chknode(ctx, level+1, node->kids[i], lower, higher);
787 ctx.treedepth = -1; /* depth unknown yet */
788 ctx.elemcount = 0; /* no elements seen yet */
790 * Verify validity of tree properties.
793 chknode(&ctx, 0, tree->root, NULL, NULL);
794 printf("tree depth: %d\n", ctx.treedepth);
796 * Enumerate the tree and ensure it matches up to the array.
798 for (i = 0, p = first234(tree, &e);
800 i++, p = next234(&e)) {
802 error("tree contains more than %d elements", arraylen);
804 error("enum at position %d: array says %s, tree says %s",
807 if (i != ctx.elemcount) {
808 error("tree really contains %d elements, enum gave %d",
812 error("enum gave only %d elements, array has %d", i, arraylen);
816 void addtest(void *elem) {
818 void *retval, *realret;
820 if (arraysize < arraylen+1) {
821 arraysize = arraylen+1+256;
822 array = (array == NULL ? malloc(arraysize*sizeof(*array)) :
823 realloc(array, arraysize*sizeof(*array)));
827 while (i < arraylen && cmp(elem, array[i]) > 0)
829 /* now i points to the first element >= elem */
830 if (i < arraylen && !cmp(elem, array[i]))
831 retval = array[i]; /* expect that returned not elem */
833 retval = elem; /* expect elem returned (success) */
834 for (j = arraylen; j > i; j--)
835 array[j] = array[j-1];
836 array[i] = elem; /* add elem to array */
840 realret = add234(tree, elem);
841 if (realret != retval) {
842 error("add: retval was %p expected %p", realret, retval);
848 void deltest(void *elem) {
852 while (i < arraylen && cmp(elem, array[i]) > 0)
854 /* now i points to the first element >= elem */
855 if (i >= arraylen || cmp(elem, array[i]) != 0)
856 return; /* don't do it! */
858 while (i < arraylen-1) {
859 array[i] = array[i+1];
862 arraylen--; /* delete elem from array */
870 /* A sample data set and test utility. Designed for pseudo-randomness,
871 * and yet repeatability. */
874 * This random number generator uses the `portable implementation'
875 * given in ANSI C99 draft N869. It assumes `unsigned' is 32 bits;
878 int randomnumber(unsigned *seed) {
881 return ((*seed) / 65536) % 32768;
884 int mycmp(void *av, void *bv) {
885 char const *a = (char const *)av;
886 char const *b = (char const *)bv;
890 #define lenof(x) ( sizeof((x)) / sizeof(*(x)) )
893 "a", "ab", "absque", "coram", "de",
894 "palam", "clam", "cum", "ex", "e",
895 "sine", "tenus", "pro", "prae",
896 "banana", "carrot", "cabbage", "broccoli", "onion", "zebra",
897 "penguin", "blancmange", "pangolin", "whale", "hedgehog",
898 "giraffe", "peanut", "bungee", "foo", "bar", "baz", "quux",
899 "murfl", "spoo", "breen", "flarn", "octothorpe",
900 "snail", "tiger", "elephant", "octopus", "warthog", "armadillo",
901 "aardvark", "wyvern", "dragon", "elf", "dwarf", "orc", "goblin",
902 "pixie", "basilisk", "warg", "ape", "lizard", "newt", "shopkeeper",
903 "wand", "ring", "amulet"
906 #define NSTR lenof(strings)
913 for (i = 0; i < NSTR; i++) in[i] = 0;
915 arraylen = arraysize = 0;
916 tree = newtree234(mycmp);
920 for (i = 0; i < 10000; i++) {
921 j = randomnumber(&seed);
923 printf("trial: %d\n", i);
925 printf("deleting %s (%d)\n", strings[j], j);
929 printf("adding %s (%d)\n", strings[j], j);
935 while (arraylen > 0) {
936 j = randomnumber(&seed);