sourcemod/core/sm_trie.cpp
David Anderson ffd308f42c Initial import of admin system (groups and overrides done)
Added a "clear" function for Tries

--HG--
extra : convert_revision : svn%3A39bc706e-5318-0410-9160-8a85361fbb7c/trunk%40340
2007-01-24 21:51:49 +00:00

808 lines
24 KiB
C++

#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include "sm_trie.h"
/**
* Double Array Trie algorithm, based on:
* An Efficient Implementation of Trie Structures, by
* Jun-ichi Aoe, Katsushi Morimoto, and Takashi Sato
* from Software - Practice and Experience, Vol. 22(9), 695-721 (September 1992)
*
* A Trie is a simple data structure which stores strings as DFAs, with each
* transition state being a string entry. For example, observe the following strings:
*
* BAILOPAN, BAT, BACON, BACK
* These transition as the following production rules:
* B -> ... B
* A -> ... BA
* I -> ... BAI
* LOPAN BAILOPAN
* T -> ... BAT
* C -> BAC
* O -> ... BACO
* N BACON
* K BACK
*
* The standard implementation for this - using lists - gives a slow linear lookup, somewhere between
* O(N+M) and O(log n). A faster implementation is proposed in the paper above, which is based on compacting
* the transition states into two arrays. In the paper's implementation, two arrays are used, and thus it is
* called the "Double Array" algorithm. However, the CHECK array's size is maintained the same as BASE,
* so they can be combined into one structure. The array seems complex at first, but is very simple: it is a
* tree structure flattened out into a single vector. I am calling this implementation the Flat Array Trie.
*
* BASE[] is an array where each member is a node in the Trie. The node can either be UNUSED (empty), an ARC
* (containing an offset to the next set of ARCs), or a TERMINATOR (contains the rest of a string).
* Each node has an index which must be interpreted based on the node type. If the node is a TERMINATOR, then the
* index is an index into a string table, to find the rest of the string.
* If the node is an ARC, the index is another index into BASE. For each possible token that can follow the
* current token, the value of those tokens can be added to the index given in the ARC. Thus, given a current
* position and the next desired token, the current arc will jump to another arc which can contain either:
* 1) An invalid production (collision, no entry exists)
* 2) An empty production (no entry exists)
* 3) Another arc label (the string ends here or continues into more productions)
* 4) A TERMINATOR (the string ends here and contains an unused set of productions)
*
* So, given current offset N (starting at N=1), jumping to token C means the next offset will be:
* offs = BASE[N] + C
* Thus, the next node will be at:
* BASE[BASE[N] + C]
*
* This allows each ARC to specify the base offset for any of its ARC children, like a tree. Each node specifies
* its parent ARC -- so if an invalid offset is specified, the parent will not match, and thus no such derived
* string exists.
*
* This means that arrays can be laid out "sparsely," maximizing their usage. Note that N need not be related to
* the range of tokens (1-256). I.e., a base index does not have to be at 1, 256, 512, et cetera. This is because
* insertion comes with a small deal of complexity. To insert a new set of tokens T, the algorithm finds a new
* BASE index N such that BASE[N+T[i]] is unused for each T[i]. Thus, indirection is not necessarily linear;
* traversing a chain of ARC nodes can _and will_ jump around BASE.
*
* Of course, given this level of flexibility in the array organization, there are collisions. This is largely
* where insertions become slow, as the old chain must be relocated before the new one is used. Relocation means
* finding one or more new base indexes, and this means traversing BASE until an acceptable index is found, such
* that each offset is unused (see description in previous paragraph).
*
* However, it is not insertion time we are concerned about. The "trie" name comes from reTRIEval. We are only
* concerned with lookup and deletion. Both lookup and deletion are O(k), where k is relative to the length of the
* input string. Note that it is best case O(1) and worst case O(k). Deleting the entire trie is always O(1).
*/
/**
* Optimization ideas for the future:
* 1) Store a reference count for each arc, with the count of sub-children.
* This could let us break out of "children searches" for case 4 easily.
* 2) Use a 'free list' so we can easily search the trie for free points.
* This would drastically speed up x_check*
*/
/**
 * Usage state of one slot in the trie's base array.
 *
 * Node_Unused is deliberately 0: the base array is zero-filled with memset(),
 * so every freshly cleared or freshly grown slot reads as unused.
 */
enum NodeType
{
	Node_Unused = 0,	/* Slot is free (the array is sparse) */
	Node_Arc = 1,		/* Slot is an arc; its idx is the base offset of its children */
	Node_Term = 2,		/* Slot is a terminator; its idx points into the string table */
};
/**
* One slot of the flattened trie array (Trie::base).
* A slot is interpreted according to its mode; a zero-filled slot is Node_Unused.
*/
struct TrieNode
{
/**
* For Node_Arc, this index stores the 'base' offset to the next arc chain.
* I.e. to jump from this arc to character C, it will be at base[idx+C].
* For Node_Term, this is an index into the string table, where the
* remainder of the key (everything after this node's character) is stored.
*/
unsigned int idx;
/**
* This contains the prior arc that we must have come from.
* For example, if arc 63 has a base jump of index 12, and we want to see if
* there is a valid character C, the parent of 12+C must be 63.
* A mismatch means the slot belongs to a different arc chain (a collision).
*/
unsigned int parent;
void *value; /* Value associated with this node; only meaningful while valset is true */
NodeType mode; /* Current usage type of the node */
bool valset; /* Whether or not a value is set (lookups test this flag, not value itself) */
};
/**
* A trie instance: the flattened node array plus the string table that
* holds the tails of terminator nodes.
*/
struct Trie
{
TrieNode *base; /* Base array for the sparse tables; allocated with baseSize + 1 elements */
char *stringtab; /* String table pointer; Node_Term nodes index into it via TrieNode::idx */
unsigned int baseSize; /* Size of the base array, in members (highest usable index) */
unsigned int stSize; /* Size of the string table, in bytes */
unsigned int tail; /* Current unused offset into the string table (where the next string is appended) */
};
/**
 * Maps a key character to its array offset.
 *
 * Going through unsigned char keeps the offset in 0..255 even on platforms
 * where plain char is signed.
 */
inline unsigned char charval(char c)
{
	const unsigned char offset = static_cast<unsigned char>(c);
	return offset;
}
/**
 * Doubles the capacity of the trie's base array.
 *
 * The array is kept at baseSize + 1 elements so that index baseSize itself is
 * addressable. Newly added slots are zero-filled, i.e. Node_Unused.
 *
 * On failure the trie is left exactly as it was: the old array stays valid and
 * baseSize is unchanged. Any TrieNode pointer obtained before a successful
 * call may be stale afterwards and must be re-read from trie->base.
 *
 * @param trie		Trie whose base array should grow.
 * @return			True on success, false if the new size would overflow or
 *					memory could not be allocated.
 */
bool sm_trie_grow(Trie *trie)
{
	const unsigned int curSize = trie->baseSize;
	const unsigned int newSize = curSize * 2;

	/* Doubling wrapped around (or the size was 0): nothing sensible to grow to. */
	if (newSize <= curSize)
	{
		return false;
	}

	/* Guard the byte count handed to realloc() against size_t overflow. */
	if ((size_t)newSize + 1 > ((size_t)-1) / sizeof(TrieNode))
	{
		return false;
	}

	//:TODO: Make functions calling this return failure if this fails
	/* Use a temporary: assigning realloc()'s result straight to trie->base would
	 * leak the old array and leave the trie with a NULL base on failure.
	 */
	TrieNode *grown = (TrieNode *)realloc(trie->base, ((size_t)newSize + 1) * sizeof(TrieNode));
	if (!grown)
	{
		return false;
	}

	/* Old slots occupy [0, curSize]; clear only the newly added ones. */
	memset(&grown[curSize + 1], 0, (size_t)(newSize - curSize) * sizeof(TrieNode));

	trie->base = grown;
	trie->baseSize = newSize;

	return true;
}
/**
 * Finds the lowest base index i >= start such that the slot for character c
 * (base[i + c]) is unused, growing the base array when no such slot exists.
 *
 * NOTE(review): the result of sm_trie_grow() is not checked here, so a failed
 * grow is simply retried.
 *
 * @param trie		Trie to search.
 * @param c			Character whose slot must be free.
 * @param start		First base index to consider.
 * @return			A base index whose slot for c is unused.
 */
unsigned int x_check(Trie *trie, char c, unsigned int start=1)
{
	const unsigned char offs = charval(c);
	unsigned int candidate = start;

	for (;;)
	{
		/* Re-read both on every pass: growing may move and enlarge the array. */
		TrieNode *nodes = trie->base;
		const unsigned int last = trie->baseSize - offs;

		while (candidate <= last)
		{
			if (nodes[candidate + offs].mode == Node_Unused)
			{
				return candidate;
			}
			candidate++;
		}

		/* Nothing free in the current array; enlarge it and resume just past
		 * the range that was already scanned.
		 */
		sm_trie_grow(trie);
		candidate = last + 1;
	}
}
/**
 * Finds the lowest base index i >= start such that the slots for both c1 and
 * c2 (base[i + c1] and base[i + c2]) are unused, growing the base array when
 * no such index exists.
 *
 * NOTE(review): the result of sm_trie_grow() is not checked here, so a failed
 * grow is simply retried.
 *
 * @param trie		Trie to search.
 * @param c1		First character whose slot must be free.
 * @param c2		Second character whose slot must be free.
 * @param start		First base index to consider.
 * @return			A base index whose slots for c1 and c2 are both unused.
 */
unsigned int x_check2(Trie *trie, char c1, char c2, unsigned int start=1)
{
	const unsigned char offsA = charval(c1);
	const unsigned char offsB = charval(c2);
	const unsigned char widest = (offsA > offsB) ? offsA : offsB;
	unsigned int candidate = start;

	for (;;)
	{
		/* Re-read both on every pass: growing may move and enlarge the array. */
		TrieNode *nodes = trie->base;
		const unsigned int last = trie->baseSize - widest;

		for (; candidate <= last; candidate++)
		{
			if (nodes[candidate + offsA].mode != Node_Unused)
			{
				continue;
			}
			if (nodes[candidate + offsB].mode != Node_Unused)
			{
				continue;
			}
			return candidate;
		}

		/* Enlarge the array and resume just past the range already scanned. */
		sm_trie_grow(trie);
		candidate = last + 1;
	}
}
/**
 * Finds the lowest base index i >= start such that base[i + offsets[j]] is
 * unused for every j in [0, count), growing the base array when no such index
 * exists.
 *
 * NOTE(review): the result of sm_trie_grow() is not checked here, so a failed
 * grow is simply retried.
 *
 * @param trie		Trie to search.
 * @param offsets	Character offsets that must all be free relative to the base.
 * @param count		Number of entries in offsets.
 * @param start		First base index to consider.
 * @return			A base index at which every requested offset is unused.
 */
unsigned int x_check_multi(Trie *trie,
						   unsigned int offsets[],
						   unsigned int count,
						   unsigned int start=1)
{
	/* The largest offset decides how close to the end of the array we may search. */
	unsigned int widest = 0;
	for (unsigned int k = 0; k < count; k++)
	{
		widest = (offsets[k] > widest) ? offsets[k] : widest;
	}

	unsigned int candidate = start;

	for (;;)
	{
		/* Re-read both on every pass: growing may move and enlarge the array. */
		TrieNode *nodes = trie->base;
		const unsigned int last = trie->baseSize - widest;

		for (; candidate <= last; candidate++)
		{
			/* Count how many of the requested slots are free at this base. */
			unsigned int k = 0;
			while (k < count && nodes[candidate + offsets[k]].mode == Node_Unused)
			{
				k++;
			}
			if (k == count)
			{
				return candidate;
			}
		}

		/* Enlarge the array and resume just past the range already scanned. */
		sm_trie_grow(trie);
		candidate = last + 1;
	}
}
/**
 * Appends a NUL-terminated string to the trie's string table.
 *
 * The table is doubled until the string (terminator included) fits strictly
 * below its end. Growing may move the table, so pointers into
 * trie->stringtab obtained before this call must not be used afterwards.
 *
 * NOTE(review): the realloc() result is stored unchecked; an allocation
 * failure is not reported to the caller.
 *
 * @param trie		Trie whose string table receives the string.
 * @param ptr		String to copy.
 * @return			Offset of the copy inside trie->stringtab.
 */
unsigned int x_addstring(Trie *trie, const char *ptr)
{
	const size_t bytes = strlen(ptr) + 1;
	const unsigned int offset = trie->tail;

	if (offset + bytes >= trie->stSize)
	{
		unsigned int capacity = trie->stSize;
		do
		{
			capacity *= 2;
		} while (offset + bytes >= capacity);

		trie->stSize = capacity;
		trie->stringtab = (char *)realloc(trie->stringtab, capacity);
	}

	strcpy(trie->stringtab + offset, ptr);
	trie->tail += bytes;

	return offset;
}
/**
 * Allocates an empty trie.
 *
 * The base array starts with room for indexes 0..256 and the string table
 * with 256 bytes; both are cleared and the root node is installed by
 * sm_trie_clear().
 *
 * @return			A new trie, or NULL if the node array or the string table
 *					could not be allocated. Release it with sm_trie_destroy().
 */
Trie *sm_trie_create()
{
	const unsigned int initialNodes = 256;
	const unsigned int initialStringBytes = 256;

	Trie *t = new Trie;

	t->base = (TrieNode *)malloc(sizeof(TrieNode) * (initialNodes + 1));
	t->stringtab = (char *)malloc(sizeof(char) * initialStringBytes);

	/* sm_trie_clear() writes through both buffers, so bail out before it can
	 * touch a NULL pointer. free(NULL) is a no-op, so both can be released
	 * unconditionally.
	 */
	if (!t->base || !t->stringtab)
	{
		free(t->base);
		free(t->stringtab);
		delete t;
		return NULL;
	}

	t->baseSize = initialNodes;
	t->stSize = initialStringBytes;

	sm_trie_clear(t);

	return t;
}
/**
 * Resets a trie to the empty state without releasing its buffers.
 *
 * Every node slot and the whole string table are zeroed, the string-table
 * write position returns to 0, and the root arc is re-created at index 1.
 * The current capacities (baseSize, stSize) are kept.
 *
 * @param trie		Trie to reset.
 */
void sm_trie_clear(Trie *trie)
{
	const size_t nodeBytes = sizeof(TrieNode) * (trie->baseSize + 1);
	const size_t stringBytes = sizeof(char) * trie->stSize;

	memset(trie->stringtab, 0, stringBytes);
	memset(trie->base, 0, nodeBytes);
	trie->tail = 0;

	/* Sentinel root: an arc at index 1 that is its own parent and whose
	 * children are based at index 1.
	 */
	TrieNode &root = trie->base[1];
	root.mode = Node_Arc;
	root.parent = 1;
	root.idx = 1;
}
/**
 * Releases a trie together with its node array and string table.
 *
 * Values stored in the trie are opaque pointers and are not freed.
 * Passing NULL is allowed and does nothing, mirroring free(NULL).
 *
 * @param trie		Trie to destroy, or NULL.
 */
void sm_trie_destroy(Trie *trie)
{
	if (!trie)
	{
		return;
	}

	free(trie->base);
	free(trie->stringtab);
	delete trie;
}
/**
 * Removes the value stored under a key.
 *
 * Only the value is dropped (valset is cleared and value reset to NULL); the
 * nodes and string-table bytes that spell the key stay in place.
 *
 * @param trie		Trie to modify.
 * @param key		Key to remove. NULL and empty keys are never present.
 * @return			True if the key had a value and it was removed, false if the
 *					key is not stored.
 */
bool sm_trie_delete(Trie *trie, const char *key)
{
	if (!key || !*key)
	{
		return false;
	}

	TrieNode *base = trie->base;
	TrieNode *node = NULL;			/* current node being processed */
	unsigned int lastidx = 1;		/* index of the arc we are leaving; 1 is the root */
	const char *keyptr = key;		/* input stream at current token */

	do
	{
		/* The child for character C of arc `lastidx` lives at base[idx + C]. */
		unsigned int curidx = base[lastidx].idx + charval(*keyptr);
		keyptr++;

		/* Reject out-of-range slots before forming a pointer to them. */
		if (curidx > trie->baseSize)
		{
			return false;
		}
		node = &base[curidx];

		/* Empty slot, or a slot owned by another arc chain (collision). */
		if (node->mode == Node_Unused || node->parent != lastidx)
		{
			return false;
		}

		if (node->mode == Node_Term)
		{
			/* A terminator has no children: the key is stored only if the rest
			 * of the input equals the terminator's tail. Anything else - and in
			 * particular a key that ends exactly on this node, i.e. a proper
			 * prefix of the stored key - is a miss, and must not fall through
			 * and delete the longer key's value.
			 */
			if (strcmp(keyptr, &trie->stringtab[node->idx]) != 0)
			{
				return false;
			}
			break;
		}

		lastidx = curidx;
	} while (*keyptr != '\0');

	assert(node != NULL);

	if (!node->valset)
	{
		return false;
	}

	node->valset = false;
	node->value = NULL;

	return true;
}
/**
 * Looks up the value stored under a key.
 *
 * @param trie		Trie to search.
 * @param key		Key to look up. NULL and empty keys are never present.
 * @param value		Optional; receives the stored value on success. May be NULL
 *					to only test for existence.
 * @return			True if the key is stored with a value, false otherwise.
 */
bool sm_trie_retrieve(Trie *trie, const char *key, void **value)
{
	if (!key || !*key)
	{
		return false;
	}

	TrieNode *base = trie->base;
	TrieNode *node = NULL;			/* current node being processed */
	unsigned int lastidx = 1;		/* index of the arc we are leaving; 1 is the root */
	const char *keyptr = key;		/* input stream at current token */

	do
	{
		/* The child for character C of arc `lastidx` lives at base[idx + C]. */
		unsigned int curidx = base[lastidx].idx + charval(*keyptr);
		keyptr++;

		/* Reject out-of-range slots before forming a pointer to them. */
		if (curidx > trie->baseSize)
		{
			return false;
		}
		node = &base[curidx];

		/* Empty slot, or a slot owned by another arc chain (collision). */
		if (node->mode == Node_Unused || node->parent != lastidx)
		{
			return false;
		}

		if (node->mode == Node_Term)
		{
			/* A terminator has no children: the key is stored only if the rest
			 * of the input equals the terminator's tail. Anything else - and in
			 * particular a key that ends exactly on this node, i.e. a proper
			 * prefix of the stored key - is a miss, and must not fall through
			 * and report the longer key's value.
			 */
			if (strcmp(keyptr, &trie->stringtab[node->idx]) != 0)
			{
				return false;
			}
			break;
		}

		lastidx = curidx;
	} while (*keyptr != '\0');

	assert(node != NULL);

	if (!node->valset)
	{
		return false;
	}

	if (value)
	{
		*value = node->value;
	}

	return true;
}
bool sm_trie_insert(Trie *trie, const char *key, void *value)
{
unsigned int lastidx = 1; /* the last node index */
unsigned int curidx; /* current node index */
const char *keyptr = key; /* input stream at current token */
TrieNode *node = NULL; /* current node being processed */
TrieNode *basenode = NULL; /* current base node being processed */
unsigned int q; /* temporary var for x_check results */
TrieNode *base = trie->base;
unsigned int curoffs; /* current offset */
/* Do not handle empty strings for simplicity */
if (!*key)
{
return false;
}
/* Start traversing at the root node (1) */
do
{
/* Find where the next character is, then advance */
curidx = base[lastidx].idx;
basenode = &base[curidx];
curoffs = charval(*keyptr);
curidx += curoffs;
node = &base[curidx];
keyptr++;
/* Check if this slot is supposed to be empty. If so, we need to handle CASES 1/2:
* Insertion without collisions
*/
if ( (curidx > trie->baseSize) || (node->mode == Node_Unused) )
{
if (curidx > trie->baseSize)
{
if (!sm_trie_grow(trie))
{
return false;
}
node = &trie->base[curidx];
}
node->parent = lastidx;
if (*keyptr == '\0')
{
node->mode = Node_Arc;
} else {
node->idx = x_addstring(trie, keyptr);
node->mode = Node_Term;
}
node->valset = true;
node->value = value;
return true;
} else if (node->parent != lastidx) {
/* Collision! We have to split up the tree here. CASE 4:
* Insertion when a new word is inserted with a collision.
* NOTE: This is the hardest case to handle. All below examples are based on:
* BACHELOR, BADGE, inserting BABY.
* The problematic production here is A -> B, where B is already being used.
*
* This process has to rotate one half of the 'A' arc. We generate two lists:
* Outgoing Arcs - Anything leaving this 'A'
* Incoming Arcs - Anything going to this 'A'
* Whichever list is smaller will be moved. Note that this works because the intersection
* affects both arc chains, and moving one will make the slot available to either.
*/
TrieNode *cur;
/* Find every node arcing from the last node.
* I.e. for BACHELOR, BADGE, BABY,
* The arcs leaving A will be C and D, but our current node is B -> *.
* Thus, we use the last index (A) to find the base for arcs leaving A.
*/
unsigned int outgoing_base = base[lastidx].idx;
unsigned int outgoing_list[256];
unsigned int outgoing_count = 0; /* count the current index here */
cur = &base[outgoing_base] + 1;
unsigned int outgoing_limit = 255;
if (outgoing_base + outgoing_limit > trie->baseSize)
{
outgoing_limit = trie->baseSize - outgoing_base;
}
for (unsigned int i=1; i<=outgoing_limit; i++,cur++)
{
if (cur->mode == Node_Unused || cur->parent != lastidx)
{
continue;
}
outgoing_list[outgoing_count++] = i;
}
outgoing_list[outgoing_count++] = curidx - outgoing_base;
/* Now we need to find all the arcs leaving our parent...
* Note: the inconsistency is the base of our parent.
*/
assert(base[node->parent].mode == Node_Arc);
unsigned int incoming_list[256];
unsigned int incoming_base = base[node->parent].idx;
unsigned int incoming_count = 0;
unsigned int incoming_limit = 255;
cur = &base[incoming_base] + 1;
if (incoming_base + incoming_limit > trie->baseSize)
{
incoming_limit = trie->baseSize - incoming_base;
}
assert(incoming_limit > 0 && incoming_limit <= 255);
for (unsigned int i=1; i<=255; i++,cur++)
{
if (cur->mode == Node_Arc || cur->mode == Node_Term)
{
if (cur->parent == node->parent)
{
incoming_list[incoming_count++] = i;
}
}
}
if (incoming_count < outgoing_count + 1)
{
unsigned int q = x_check_multi(trie, incoming_list, incoming_count);
base = trie->base;
node = &base[curidx];
/* If we're incoming, we need to modify our parent */
base[node->parent].idx = q;
/* For each node in the "to move" list,
* Relocate the node's info to the new position.
*/
unsigned int idx, newidx, oldidx;
for (unsigned int i=0; i<incoming_count; i++)
{
idx = incoming_list[i];
newidx = q + idx;
oldidx = incoming_base + idx;
if (oldidx == lastidx)
{
/* Important! Make sure we're not invalidating our sacred lastidx */
lastidx = newidx;
}
base[newidx] = base[oldidx];
assert(base[base[newidx].parent].mode == Node_Arc);
memset(&base[oldidx], 0, sizeof(TrieNode));
/* If we are not a terminator, we have children we must take care of */
if (base[newidx].mode == Node_Arc)
{
TrieNode *check_base = &base[base[newidx].idx] + 1;
for (unsigned int j=1; j<=255; j++, check_base++)
{
if (check_base->parent == oldidx)
{
check_base->parent = newidx;
}
}
}
}
} else {
unsigned int q = x_check_multi(trie, outgoing_list, outgoing_count);
base = trie->base;
node = &base[curidx];
/* If we're outgoing, we need to modify our own base */
base[lastidx].idx = q;
/* Take the last index (curidx) out of the list. Technically we are not moving this,
* since it's already being used by something else.
*/
outgoing_count--;
/* For each node in the "to move" list,
* Relocate the node's info to the new position.
*/
unsigned int idx, newidx, oldidx;
for (unsigned int i=0; i<outgoing_count; i++)
{
idx = outgoing_list[i];
newidx = q + idx;
oldidx = outgoing_base + idx;
if (oldidx == lastidx)
{
/* Important! Make sure we're not invalidating our sacred lastidx */
lastidx = newidx;
}
base[newidx] = base[oldidx];
assert(base[base[newidx].parent].mode == Node_Arc);
memset(&base[oldidx], 0, sizeof(TrieNode));
/* If we are not a terminator, we have children we must take care of */
if (base[newidx].mode == Node_Arc)
{
TrieNode *check_base = &base[base[newidx].idx] + 1;
for (unsigned int j=1; j<=255; j++, check_base++)
{
if (check_base->parent == oldidx)
{
check_base->parent = newidx;
}
}
}
}
/* Take the invisible node and use it as our new node */
node = &base[q + outgoing_list[outgoing_count]];
}
/* We're finally done! */
node->parent = lastidx;
if (*keyptr == '\0')
{
node->mode = Node_Arc;
} else {
node->idx = x_addstring(trie, keyptr);
node->mode = Node_Term;
}
node->valset = true;
node->value = value;
return true;
} else {
/* See what's in the next node - special case if terminator! */
if (node->mode == Node_Term)
{
/* If we're a terminator, we need to handle CASE 3:
* Insertion when a terminating collision occurs
*/
char *term = &trie->stringtab[node->idx];
/* Do an initial browsing to make sure they're not the same string */
if (strcmp(keyptr, term) == 0)
{
if (!node->valset)
{
node->valset = true;
node->value = value;
return true;
}
/* Same string. We can't insert. */
return false;
}
/* For each matching character pair, we need to disband the terminator.
* This splits the similar prefix into a single arc path.
* First, save the old values so we can move them to a new node.
* Next, for each loop:
* Take the current (invalid) node, and point it to the next arc base.
* Set the current node to the node at the next arc.
*/
void *oldvalue = node->value;
bool oldvalset = node->valset;
if (*term == *keyptr)
{
while (*term == *keyptr)
{
/* Find the next free slot in the check array.
* This is the "vector base" essentially
*/
q = x_check(trie, *term);
base = trie->base;
node = &base[curidx];
/* Point the node to the next new base */
node->idx = q;
node->mode = Node_Arc;
node->valset = false;
/* Advance the input stream and local variables */
lastidx = curidx;
curidx = q + charval(*term);
node = &base[curidx];
/* Make sure the new current node has its parent set. */
node->parent = lastidx;
node->mode = Node_Arc; /* Just in case we run x_check again */
*term = '\0'; /* Unmark the string table here */
term++;
keyptr++;
}
} else {
node->valset = false;
}
/* We're done inserting new pairs. If one of them is exhausted,
* we take special shortcuts.
*/
if (*term == '\0') //EX: BADGERHOUSE added over B -> ADGER.
{
/* First backpatch the current node - it ends the newly split terminator.
* In the example, this would mean the node is the production from R -> ?
* This node ends the old BADGER, so we set it here.
*/
node->valset = oldvalset;
node->value = oldvalue;
/* The terminator was split up, but pieces of keyptr remain.
* We need to generate a new production, in this example, R -> H,
* with H being a terminator to OUSE. Thus we get:
* B,A,D,G,E,R*,H*->OUSE (* = value set).
* NOTE: parent was last set at the end of the while loop.
*/
/* Get the new base and apply re-basing */
q = x_check(trie, *keyptr);
base = trie->base;
node = &base[curidx];
node->idx = q;
node->mode = Node_Arc;
lastidx = curidx;
/* Finish the final node */
curidx = q + charval(*keyptr);
node = &trie->base[curidx];
keyptr++;
/* Optimize - don't add to string table if there's nothing more to eat */
if (*keyptr == '\0')
{
node->mode = Node_Arc;
} else {
node->idx = x_addstring(trie, keyptr);
node->mode = Node_Term;
}
node->parent = lastidx;
node->valset = true;
node->value = value;
} else if (*keyptr == '\0') { //EX: BADGER added over B -> ADGERHOUSE
/* First backpatch the current node - it ends newly split input string.
* This is the exact opposite of the above procedure.
*/
node->valset = true;
node->value = value;
/* Get the new base and apply re-basing */
q = x_check(trie, *term);
base = trie->base;
node = &base[curidx];
node->idx = q;
node->mode = Node_Arc;
lastidx = curidx;
/* Finish the final node */
curidx = q + charval(*term);
node = &trie->base[curidx];
term++;
/* Optimize - don't add to string table if there's nothing more to eat */
if (*term == '\0')
{
node->mode = Node_Arc;
} else {
node->idx = (term - trie->stringtab); /* Already in the string table! */
node->mode = Node_Term;
}
node->parent = lastidx;
node->valset = oldvalset;
node->value = oldvalue;
} else {
/* Finally, we have to create two new nodes instead of just one. */
node->mode = Node_Arc;
/* Get the new base and apply re-basing */
q = x_check2(trie, *keyptr, *term);
base = trie->base;
node = &base[curidx];
node->idx = q;
lastidx = curidx;
/* Re-create the old terminated node */
curidx = q + charval(*term);
node = &trie->base[curidx];
term++;
node->valset = oldvalset;
node->value = oldvalue;
node->parent = lastidx;
if (*term == '\0')
{
node->mode = Node_Arc;
} else {
node->mode = Node_Term;
node->idx = (term - trie->stringtab); /* Already in the string table! */
}
/* Create the new keyed input node */
curidx = q + charval(*keyptr);
node = &trie->base[curidx];
keyptr++;
node->valset = true;
node->value = value;
node->parent = lastidx;
if (*keyptr == '\0')
{
node->mode = Node_Arc;
} else {
node->mode = Node_Term;
node->idx = x_addstring(trie, keyptr);
}
}
/* Phew! */
return true;
} else {
assert(node->mode == Node_Arc);
}
}
lastidx = curidx;
} while (*keyptr != '\0');
assert(node);
/* If we've exhausted the string and we have a valid reached node,
* the production rule already existed. Make sure it's valid to set first.
*/
/* We have to be an Arc. If the last result was anything else, we would have returned a new
* production earlier.
*/
assert(node->mode == Node_Arc);
/* Furthermore, if this is the last arc label in an arc, we should have a value set. */
if (node->idx == 0)
{
assert(node->valset == true);
}
if (!node->valset)
{
/* Insert is only possible if we have no production */
node->valset = true;
node->value = value;
return true;
}
return false;
}