Various minor things done to project files Updated sample extension project file and updated makefile to the new unified version (more changes likely on the way) Updated regex project file and makefile --HG-- extra : convert_revision : svn%3A39bc706e-5318-0410-9160-8a85361fbb7c/trunk%401971
992 lines
29 KiB
992 lines
29 KiB
* vim: set ts=4 :
* =============================================================================
* SourceMod
* Copyright (C) 2004-2008 AlliedModders LLC. All rights reserved.
* =============================================================================
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, version 3.0, as published by the
* Free Software Foundation.
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
* As a special exception, AlliedModders LLC gives you permission to link the
* code of this program (as well as its derivative works) to "Half-Life 2," the
* "Source Engine," the "SourcePawn JIT," and any Game MODs that run on software
* by the Valve Corporation. You must obey the GNU General Public License in
* all respects for all other code used. Additionally, AlliedModders LLC grants
* this exception to all derivative works. AlliedModders LLC defines further
* exceptions, found in LICENSE.txt (as of this writing, version JULY-31-2007),
* or <http://www.sourcemod.net/license.php>.
* Version: $Id$
#include <assert.h>
#include <malloc.h>
#include <stdlib.h>
#include <string.h>
#include <new>
/**
 * @brief Usage state of a single slot in the trie's node array.
 *
 * Node_Unused is explicitly zero, so a zero-filled node reads as unused.
 */
enum NodeType
{
	Node_Unused = 0,	/* Node is not being used (sparse) */
	Node_Arc,			/* Node is part of an arc and does not terminate */
	Node_Term			/* Node is a terminator */
};
* @brief Trie class for storing key/value pairs, based on double array tries.
* @file sm_trie_tpl.h
* For full works cited and implementation overview, there is a big comment
* block at the bottom of this file.
template <typename K>
class KTrie
class KTrieNode;
* @brief Clears all set objects in the trie.
void clear()
* @brief Removes a key from the trie.
* @param key Key to remove.
* @return True on success, false if key was never set.
bool remove(const char *key)
KTrieNode *node = internal_retrieve(key);
if (!node || !node->valset)
return false;
node->valset = false;
return true;
* @brief Retrieves a pointer to the object stored at a given key.
* @param key Key to retrieve.
* @return Pointer to object, or NULL if key was not found or not set.
K * retrieve(const char *key)
KTrieNode *node = internal_retrieve(key);
if (!node || !node->valset)
return NULL;
return &node->value;
* @brief Inserts or updates the object stored at a key.
* @param key Key to update or insert.
* @param obj Object to store at the key.
* @return True on success, false on failure.
bool replace(const char *key, const K & obj)
KTrieNode *prev_node = internal_retrieve(key);
if (!prev_node)
return insert(key, obj);
if (prev_node->valset)
new (&prev_node->value) K(obj);
return true;
* @brief Inserts an object at a key.
* @param key Key to insert at.
* @param obj Object to store at the key.
* @return True on success, false if the key is already set or
* insertion otherwise failed.
bool insert(const char *key, const K & obj)
unsigned int lastidx = 1; /* the last node index */
unsigned int curidx; /* current node index */
const char *keyptr = key; /* input stream at current token */
KTrieNode *node = NULL; /* current node being processed */
KTrieNode *basenode = NULL; /* current base node being processed */
unsigned int q; /* temporary var for x_check results */
unsigned int curoffs; /* current offset */
* Empty strings are a special case, since there are no productions. We could
* probably rework it to use BASE[0] but this hack is easier.
if (*key == '\0')
if (m_empty != NULL && m_empty->valset)
return false;
if (m_empty == NULL)
m_empty = (KTrieNode *)malloc(sizeof(KTrieNode));
m_empty->valset = true;
new (&m_empty->value) K(obj);
return true;
/* Start traversing at the root node (1) */
/* Find where the next character is, then advance */
curidx = m_base[lastidx].idx;
basenode = &m_base[curidx];
curoffs = charval(*keyptr);
curidx += curoffs;
node = &m_base[curidx];
/* Check if this slot is supposed to be empty. If so, we need to handle CASES 1/2:
* Insertion without collisions
if ( (curidx > m_baseSize) || (node->mode == Node_Unused) )
if (curidx > m_baseSize)
if (!grow())
return false;
node = &m_base[curidx];
node->parent = lastidx;
if (*keyptr == '\0')
node->mode = Node_Arc;
node->idx = x_addstring(keyptr);
node->mode = Node_Term;
node->valset = true;
new (&node->value) K(obj);
return true;
else if (node->parent != lastidx)
/* Collision! We have to split up the tree here. CASE 4:
* Insertion when a new word is inserted with a collision.
* NOTE: This is the hardest case to handle. All below examples are based on:
* BACHELOR, BADGE, inserting BABY.
* The problematic production here is A -> B, where B is already being used.
* This process has to rotate one half of the 'A' arc. We generate two lists:
* Outgoing Arcs - Anything leaving this 'A'
* Incoming Arcs - Anything going to this 'A'
* Whichever list is smaller will be moved. Note that this works because the intersection
* affects both arc chains, and moving one will make the slot available to either.
KTrieNode *cur;
/* Find every node arcing from the last node.
* The arcs leaving A will be C and D, but our current node is B -> *.
* Thus, we use the last index (A) to find the base for arcs leaving A.
unsigned int outgoing_base = m_base[lastidx].idx;
unsigned int outgoing_list[256];
unsigned int outgoing_count = 0; /* count the current index here */
cur = &m_base[outgoing_base] + 1;
unsigned int outgoing_limit = 255;
if (outgoing_base + outgoing_limit > m_baseSize)
outgoing_limit = m_baseSize - outgoing_base;
for (unsigned int i=1; i<=outgoing_limit; i++,cur++)
if (cur->mode == Node_Unused || cur->parent != lastidx)
outgoing_list[outgoing_count++] = i;
outgoing_list[outgoing_count++] = curidx - outgoing_base;
/* Now we need to find all the arcs leaving our parent...
* Note: the inconsistency is the base of our parent.
assert(m_base[node->parent].mode == Node_Arc);
unsigned int incoming_list[256];
unsigned int incoming_base = m_base[node->parent].idx;
unsigned int incoming_count = 0;
unsigned int incoming_limit = 255;
cur = &m_base[incoming_base] + 1;
if (incoming_base + incoming_limit > m_baseSize)
incoming_limit = m_baseSize - incoming_base;
assert(incoming_limit > 0 && incoming_limit <= 255);
for (unsigned int i=1; i<=incoming_limit; i++,cur++)
if (cur->mode == Node_Arc || cur->mode == Node_Term)
if (cur->parent == node->parent)
incoming_list[incoming_count++] = i;
if (incoming_count < outgoing_count + 1)
unsigned int q = x_check_multi(incoming_list, incoming_count);
node = &m_base[curidx];
/* If we're incoming, we need to modify our parent */
m_base[node->parent].idx = q;
/* For each node in the "to move" list,
* Relocate the node's info to the new position.
unsigned int idx, newidx, oldidx;
for (unsigned int i=0; i<incoming_count; i++)
idx = incoming_list[i];
newidx = q + idx;
oldidx = incoming_base + idx;
if (oldidx == lastidx)
/* Important! Make sure we're not invalidating our sacred lastidx */
lastidx = newidx;
/* Fully copy the node */
memcpy(&m_base[newidx], &m_base[oldidx], sizeof(KTrieNode));
if (m_base[oldidx].valset)
new (&m_base[newidx].value) K(m_base[oldidx].value);
assert(m_base[m_base[newidx].parent].mode == Node_Arc);
/* Erase old data */
memset(&m_base[oldidx], 0, sizeof(KTrieNode));
/* If we are not a terminator, we have children we must take care of */
if (m_base[newidx].mode == Node_Arc)
KTrieNode *check_base = &m_base[m_base[newidx].idx] + 1;
outgoing_limit = (m_base + m_baseSize + 1) - check_base;
if (outgoing_limit > 255)
outgoing_limit = 255;
for (unsigned int j=1; j<=outgoing_limit; j++, check_base++)
if (check_base->parent == oldidx)
check_base->parent = newidx;
unsigned int q = x_check_multi(outgoing_list, outgoing_count);
node = &m_base[curidx];
/* If we're outgoing, we need to modify our own base */
m_base[lastidx].idx = q;
/* Take the last index (curidx) out of the list. Technically we are not moving this,
* since it's already being used by something else.
/* For each node in the "to move" list,
* Relocate the node's info to the new position.
unsigned int idx, newidx, oldidx;
for (unsigned int i=0; i<outgoing_count; i++)
idx = outgoing_list[i];
newidx = q + idx;
oldidx = outgoing_base + idx;
if (oldidx == lastidx)
/* Important! Make sure we're not invalidating our sacred lastidx */
lastidx = newidx;
/* Fully copy the node */
memcpy(&m_base[newidx], &m_base[oldidx], sizeof(KTrieNode));
if (m_base[oldidx].valset)
new (&m_base[newidx].value) K(m_base[oldidx].value);
assert(m_base[m_base[newidx].parent].mode == Node_Arc);
/* Erase old data */
memset(&m_base[oldidx], 0, sizeof(KTrieNode));
/* If we are not a terminator, we have children we must take care of */
if (m_base[newidx].mode == Node_Arc)
KTrieNode *check_base = &m_base[m_base[newidx].idx] + 1;
outgoing_limit = (m_base + m_baseSize + 1) - check_base;
if (outgoing_limit > 255)
outgoing_limit = 255;
for (unsigned int j=1; j<=outgoing_limit; j++, check_base++)
if (check_base->parent == oldidx)
check_base->parent = newidx;
/* Take the invisible node and use it as our new node */
node = &m_base[q + outgoing_list[outgoing_count]];
/* We're finally done! */
node->parent = lastidx;
if (*keyptr == '\0')
node->mode = Node_Arc;
node->idx = x_addstring(keyptr);
node->mode = Node_Term;
node->valset = true;
new (&node->value) K(obj);
return true;
/* See what's in the next node - special case if terminator! */
if (node->mode == Node_Term)
/* If we're a terminator, we need to handle CASE 3:
* Insertion when a terminating collision occurs
char *term = &m_stringtab[node->idx];
/* Do an initial browsing to make sure they're not the same string */
if (strcmp(keyptr, term) == 0)
if (!node->valset)
node->valset = true;
new (&node->value) K(obj);
return true;
/* Same string. We can't insert. */
return false;
/* For each matching character pair, we need to disband the terminator.
* This splits the similar prefix into a single arc path.
* First, save the old values so we can move them to a new node.
* Next, for each loop:
* Take the current (invalid) node, and point it to the next arc base.
* Set the current node to the node at the next arc.
K oldvalue;
bool oldvalset = node->valset;
if (oldvalset)
oldvalue = node->value;
if (*term == *keyptr)
while (*term == *keyptr)
/* Find the next free slot in the check array.
* This is the "vector base" essentially
q = x_check(*term);
node = &m_base[curidx];
/* Point the node to the next new base */
node->idx = q;
node->mode = Node_Arc;
if (node->valset == true)
node->valset = false;
/* Advance the input stream and local variables */
lastidx = curidx;
curidx = q + charval(*term);
node = &m_base[curidx];
/* Make sure the new current node has its parent set. */
node->parent = lastidx;
node->mode = Node_Arc; /* Just in case we run x_check again */
*term = '\0'; /* Unmark the string table here */
else if (node->valset)
node->valset = false;
/* We're done inserting new pairs. If one of them is exhausted,
* we take special shortcuts.
if (*term == '\0') //EX: BADGERHOUSE added over B -> ADGER.
/* First backpatch the current node - it ends the newly split terminator.
* In the example, this would mean the node is the production from R -> ?
* This node ends the old BADGER, so we set it here.
node->valset = oldvalset;
if (node->valset)
new (&node->value) K(oldvalue);
/* The terminator was split up, but pieces of keyptr remain.
* We need to generate a new production, in this example, R -> H,
* with H being a terminator to OUSE. Thus we get:
* B,A,D,G,E,R*,H*->OUSE (* = value set).
* NOTE: parent was last set at the end of the while loop.
/* Get the new base and apply re-basing */
q = x_check(*keyptr);
node = &m_base[curidx];
node->idx = q;
node->mode = Node_Arc;
lastidx = curidx;
/* Finish the final node */
curidx = q + charval(*keyptr);
node = &m_base[curidx];
/* Optimize - don't add to string table if there's nothing more to eat */
if (*keyptr == '\0')
node->mode = Node_Arc;
node->idx = x_addstring(keyptr);
node->mode = Node_Term;
node->parent = lastidx;
node->valset = true;
new (&node->value) K(obj);
else if (*keyptr == '\0')
{ //EX: BADGER added over B -> ADGERHOUSE
/* First backpatch the current node - it ends newly split input string.
* This is the exact opposite of the above procedure.
node->valset = true;
new (&node->value) K(obj);
/* Get the new base and apply re-basing */
q = x_check(*term);
node = &m_base[curidx];
node->idx = q;
node->mode = Node_Arc;
lastidx = curidx;
/* Finish the final node */
curidx = q + charval(*term);
node = &m_base[curidx];
/* Optimize - don't add to string table if there's nothing more to eat */
if (*term == '\0')
node->mode = Node_Arc;
node->idx = (term - m_stringtab); /* Already in the string table! */
node->mode = Node_Term;
node->parent = lastidx;
node->valset = oldvalset;
if (node->valset)
new (&node->value) K(oldvalue);
/* Finally, we have to create two new nodes instead of just one. */
node->mode = Node_Arc;
/* Get the new base and apply re-basing */
q = x_check2(*keyptr, *term);
node = &m_base[curidx];
node->idx = q;
lastidx = curidx;
/* Re-create the old terminated node */
curidx = q + charval(*term);
node = &m_base[curidx];
node->valset = oldvalset;
if (node->valset)
new (&node->value) K(oldvalue);
node->parent = lastidx;
if (*term == '\0')
node->mode = Node_Arc;
node->mode = Node_Term;
node->idx = (term - m_stringtab); /* Already in the string table! */
/* Create the new keyed input node */
curidx = q + charval(*keyptr);
node = &m_base[curidx];
node->valset = true;
new (&node->value) K(obj);
node->parent = lastidx;
if (*keyptr == '\0')
node->mode = Node_Arc;
node->mode = Node_Term;
node->idx = x_addstring(keyptr);
/* Phew! */
return true;
assert(node->mode == Node_Arc);
lastidx = curidx;
} while (*keyptr != '\0');
/* If we've exhausted the string and we have a valid reached node,
* the production rule already existed. Make sure it's valid to set first.
/* We have to be an Arc. If the last result was anything else, we would have returned a new
* production earlier.
assert(node->mode == Node_Arc);
if (!node->valset)
node->valset = true;
new (&node->value) K(obj);
return true;
return false;
m_base = (KTrieNode *)malloc(sizeof(KTrieNode) * (256 + 1));
m_stringtab = (char *)malloc(sizeof(char) * 256);
m_baseSize = 256;
m_stSize = 256;
m_empty = NULL;
m_numElements = 0;
if (m_empty != NULL && m_empty->valset)
m_empty->valset = false;
/**
 * @brief Runs a caller-supplied cleanup function over every set value in the
 * node array and marks those values as unset.
 *
 * The empty-string entry is not part of the node array and is not visited.
 *
 * @param dtor		Function invoked with a pointer to each set value.
 */
void run_destructor(void (*dtor)(K * ptr))
{
	for (size_t i = 0; i <= m_baseSize; i++)
	{
		if (m_base[i].valset)
		{
			dtor(&m_base[i].value);
			m_base[i].valset = false;
		}
	}
}
/**
 * @brief A single slot of the flattened trie array.
 */
class KTrieNode
{
	friend class KTrie;
private:
	/**
	 * For Node_Arc, this index stores the 'base' offset to the next arc chain.
	 * I.e. to jump from this arc to character C, it will be at base[idx+C].
	 * For Node_Term, this is an index into the string table.
	 */
	unsigned int idx;

	/**
	 * This contains the prior arc that we must have come from.
	 * For example, if arc 63 has a base jump of index 12, and we want to see if
	 * there is a valid character C, the parent of 12+C must be 63.
	 */
	unsigned int parent;
	K value;			/* Value associated with this node */
	NodeType mode;		/* Current usage type of the node */
	bool valset;		/* Whether or not a value is set */
};
/**
 * @brief Walks the trie along a key and returns the node it ends on.
 *
 * The returned node is not guaranteed to carry a value; callers check
 * its valset flag.
 *
 * @param key		Key to look up.
 * @return			Node reached by the key, the empty-string node (possibly
 *					NULL) for an empty key, or NULL if no node matches.
 */
KTrieNode *internal_retrieve(const char *key)
{
	unsigned int lastidx = 1;	/* the last node index */
	unsigned int curidx;		/* current node index */
	const char *keyptr = key;	/* input stream at current token */
	KTrieNode *node = NULL;		/* current node being processed */

	if (!*key)
	{
		return m_empty;
	}

	/* Start traversing at the root node */
	do
	{
		/* Find where the next character is, then advance */
		curidx = m_base[lastidx].idx + charval(*keyptr);
		keyptr++;

		/* Reject out-of-range slots before touching the array */
		if (curidx > m_baseSize)
		{
			return NULL;
		}
		node = &m_base[curidx];

		/* Check if this slot is supposed to be empty or is a collision */
		if (node->mode == Node_Unused || node->parent != lastidx)
		{
			return NULL;
		}
		else if (node->mode == Node_Term)
		{
			/* A terminator holds the rest of the key in the string table;
			 * the remainder must match exactly.
			 */
			char *term = &m_stringtab[node->idx];
			if (strcmp(keyptr, term) == 0)
			{
				break;
			}
			return NULL;
		}
		lastidx = curidx;
	} while (*keyptr != '\0');

	return node;
}
bool grow()
/* The current # of nodes in the tree is trie->baseSize + 1 */
unsigned int cur_size = m_baseSize;
unsigned int new_size = cur_size * 2;
KTrieNode *new_base = (KTrieNode *)malloc((new_size + 1) * sizeof(KTrieNode));
if (!new_base)
return false;
memcpy(new_base, m_base, sizeof(KTrieNode) * (m_baseSize + 1));
memset(&new_base[cur_size + 1], 0, (new_size - cur_size) * sizeof(KTrieNode));
for (size_t i = 0; i <= m_baseSize; i++)
if (m_base[i].valset)
/* Placement construct+copy the object, then placement destroy the old. */
new (&new_base[i].value) K(m_base[i].value);
m_base = new_base;
m_baseSize = new_size;
return true;
/**
 * @brief Converts a key character to its unsigned byte value, so it can be
 * used as a non-negative array offset even where plain char is signed.
 *
 * @param c			Character to convert.
 * @return			The same byte as an unsigned char.
 */
inline unsigned char charval(char c)
{
	return (unsigned char)c;
}
void internal_clear()
m_tail = 0;
m_numElements = 0;
memset(m_base, 0, sizeof(KTrieNode) * (m_baseSize + 1));
memset(m_stringtab, 0, sizeof(char) * m_stSize);
/* Sentinel root node */
m_base[1].idx = 1;
m_base[1].mode = Node_Arc;
m_base[1].parent = 1;
void run_destructors()
for (size_t i = 0; i <= m_baseSize; i++)
if (m_base[i].valset)
unsigned int x_addstring(const char *ptr)
size_t len = strlen(ptr) + 1;
if (m_tail + len >= m_stSize)
while (m_tail + len >= m_stSize)
m_stSize *= 2;
m_stringtab = (char *)realloc(m_stringtab,m_stSize);
unsigned int tail = m_tail;
strcpy(&m_stringtab[tail], ptr);
m_tail += len;
return tail;
unsigned int x_check(char c, unsigned int start=1)
unsigned char _c = charval(c);
unsigned int to_check = m_baseSize - _c;
for (unsigned int i=start; i<=to_check; i++)
if (m_base[i+_c].mode == Node_Unused)
return i;
return x_check(c, to_check+1);
unsigned int x_check2(char c1, char c2, unsigned int start=1)
unsigned char _c1 = charval(c1);
unsigned char _c2 = charval(c2);
unsigned int to_check = m_baseSize - (_c1 > _c2 ? _c1 : _c2);
for (unsigned int i=start; i<=to_check; i++)
if (m_base[i+_c1].mode == Node_Unused
&& m_base[i+_c2].mode == Node_Unused)
return i;
return x_check2(c1, c2, to_check+1);
unsigned int x_check_multi(
unsigned int offsets[],
unsigned int count,
unsigned int start=1)
KTrieNode *cur;
unsigned int to_check = m_baseSize;
unsigned int highest = 0;
for (unsigned int i=0; i<count; i++)
if (offsets[i] > highest)
highest = offsets[i];
to_check -= highest;
for (unsigned int i=start; i<=to_check; i++)
bool okay = true;
for (unsigned int j=0; j<count; j++)
cur = &m_base[i+offsets[j]];
if (cur->mode != Node_Unused)
okay = false;
if (okay)
return i;
return x_check_multi(offsets, count, to_check+1);
size_t mem_usage()
return (sizeof(KTrieNode) * (m_baseSize))
+ m_stSize
+ sizeof(KTrieNode);
size_t size()
return m_numElements;
KTrieNode *m_base; /* Base array for the sparse tables */
KTrieNode *m_empty; /* Special case for empty strings */
char *m_stringtab; /* String table pointer */
unsigned int m_baseSize; /* Size of the base array, in members */
unsigned int m_stSize; /* Size of the string table, in bytes */
unsigned int m_tail; /* Current unused offset into the string table */
size_t m_numElements; /* Number of elements in use */
* Double Array Trie algorithm, based on:
* An Efficient Implementation of Trie Structures, by
* Jun-ichi Aoe, Katsushi Morimoto, and Takashi Sato
* from Software - Practice and Experience, Vol. 22(9), 695-721 (September 1992)
* A Trie is a simple data structure which stores strings as DFAs, with each
* transition state being a string entry. For example, observe the following strings:
* These transition according to the following production rules:
* B -> ... B
* A -> ... BA
* I -> ... BAI
* T -> ... BAT
* C -> BAC
* O -> ... BACO
* The standard implementation for this - using lists - gives a slow linear lookup, somewhere between
* O(N+M) or O(log n). A faster implementation is proposed in the paper above, which is based on compacting
* the transition states into two arrays. In the paper's implementation, two arrays are used, and thus it is
* called the "Double Array" algorithm. However, the CHECK array's size is maintained the same as BASE,
* so they can be combined into one structure. The array seems complex at first, but is very simple: it is a
* tree structure flattened out into a single vector. I am calling this implementation the Flat Array Trie.
* BASE[] is an array where each member is a node in the Trie. The node can either be UNUSED (empty), an ARC
* (containing an offset to the next set of ARCs), or a TERMINATOR (contains the rest of a string).
* Each node has an index which must be interpreted based on the node type. If the node is a TERMINATOR, then the
* index is an index into a string table, to find the rest of the string.
* If the node is an ARC, the index is another index into BASE. For each possible token that can follow the
* current token, the value of those tokens can be added to the index given in the ARC. Thus, given a current
* position and the next desired token, the current arc will jump to another arc which can contain either:
* 1) An invalid production (collision, no entry exists)
* 2) An empty production (no entry exists)
* 3) Another arc label (the string ends here or continues into more productions)
* 4) A TERMINATOR (the string ends here and contains an unused set of productions)
* So, given current offset N (starting at N=1), jumping to token C means the next offset will be:
* offs = BASE[n] + C
* Thus, the next node will be at:
* BASE[BASE[n] + C]
* This allows each ARC to specify the base offset for any of its ARC children, like a tree. Each node specifies
* its parent ARC -- so if an invalid offset is specified, the parent will not match, and thus no such derived
* string exists.
* This means that arrays can be laid out "sparsely," maximizing their usage. Note that N need not be related to
* the range of tokens (1-256). I.e., a base index does not have to be at 1, 256, 512, et cetera. This is because
* insertion comes with a small deal of complexity. To insert a new set of tokens T, the algorithm finds a new
* BASE index N such that BASE[N+T[i]] is unused for each T[i]. Thus, indirection is not necessarily linear;
* traversing a chain of ARC nodes can _and will_ jump around BASE.
* Of course, given this level of flexibility in the array organization, there are collisions. This is largely
* where insertions become slow, as the old chain must be relocated before the new one is used. Relocation means
* finding one or more new base indexes, and this means traversing BASE until an acceptable index is found, such
* that each offset is unused (see description in previous paragraph).
* However, it is not insertion time we are concerned about. The "trie" name comes from reTRIEval. We are only
* concerned with lookup and deletion. Both lookup and deletion are O(k), where k is relative to the length of the
* input string. Note that it is best case O(1) and worst case O(k). Deleting the entire trie is always O(1).