sourcemod/core/TextParsers.cpp
Scott Ehlert a1009aed38 Updated license headers on virtually all files with extremely minor changes in the name of some sort of strange consistency.
All plugin and include file headers also have been changed to say about GPL3 instead of GPL2.

(This day shall henceforth be known as the Eighty Column Massacre of '07)

--HG--
extra : convert_revision : svn%3A39bc706e-5318-0410-9160-8a85361fbb7c/trunk%401336
2007-08-15 06:19:30 +00:00

1003 lines
23 KiB
C++

/**
* vim: set ts=4 :
* =============================================================================
* SourceMod
* Copyright (C) 2004-2007 AlliedModders LLC. All rights reserved.
* =============================================================================
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, version 3.0, as published by the
* Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, AlliedModders LLC gives you permission to link the
* code of this program (as well as its derivative works) to "Half-Life 2," the
* "Source Engine," the "SourcePawn JIT," and any Game MODs that run on software
* by the Valve Corporation. You must obey the GNU General Public License in
* all respects for all other code used. Additionally, AlliedModders LLC grants
* this exception to all derivative works. AlliedModders LLC defines further
* exceptions, found in LICENSE.txt (as of this writing, version JULY-31-2007),
* or <http://www.sourcemod.net/license.php>.
*
* Version: $Id$
*/
#include <stdio.h>
#include <ctype.h>
#include <wctype.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include "TextParsers.h"
#include "ShareSys.h"
/* The global text parser instance. Its constructor fills in the character lookup tables used by the parsers in this file. */
TextParsers g_TextParser;
/**
 * Character-class lookup tables, indexed by byte value.
 *
 * Each table has one slot for every possible byte (0-255) so that any
 * character, once converted to unsigned char, is a valid index.
 *
 * g_ini_chartable1: non-alphanumeric characters accepted in INI names.
 * g_ws_chartable:   whitespace characters.
 *
 * Both tables are populated by the TextParsers constructor.
 */
static int g_ini_chartable1[256] = {0};
static int g_ws_chartable[256] = {0};

/**
 * @brief Tests whether the first byte of a string is whitespace.
 *
 * @param stream	Pointer to at least one readable character.
 * @return			True if the character is marked in the whitespace table.
 */
bool IsWhitespace(const char *stream)
{
	/* Go through unsigned char: plain char may be signed, and converting a
	 * negative char straight to unsigned yields a huge out-of-range index.
	 */
	return g_ws_chartable[static_cast<unsigned char>(*stream)] == 1;
}
/**
 * Populates the two character lookup tables.
 *
 * Every character listed below gets the value 1 in its table; all other
 * entries keep their zero initialization.
 */
TextParsers::TextParsers()
{
	/* Punctuation accepted in INI names in addition to alphanumerics */
	static const char ini_extras[] = "_-,+.$?/";
	/* Characters treated as whitespace */
	static const char blanks[] = "\n\v\r\t\f ";

	for (const char *p = ini_extras; *p != '\0'; ++p)
	{
		g_ini_chartable1[static_cast<unsigned char>(*p)] = 1;
	}

	for (const char *p = blanks; *p != '\0'; ++p)
	{
		g_ws_chartable[static_cast<unsigned char>(*p)] = 1;
	}
}
/**
 * Passes this object to g_ShareSys.AddInterface() with a NULL first argument.
 *
 * NOTE(review): presumably this publishes the text parser as a shared
 * interface once SourceMod has finished initializing, with NULL meaning
 * "owned by the core" -- confirm against ShareSys.
 */
void TextParsers::OnSourceModAllInitialized()
{
g_ShareSys.AddInterface(NULL, this);
}
/**
 * Member wrapper that forwards to _GetUTF8CharBytes().
 *
 * @param stream	Pointer to the character to inspect.
 * @return			Whatever _GetUTF8CharBytes() returns for that pointer
 *					(presumably the byte length of the UTF-8 character
 *					starting there -- confirm against its definition).
 */
unsigned int TextParsers::GetUTF8CharBytes(const char *stream)
{
return _GetUTF8CharBytes(stream);
}
/**
* Character streams
*/
/** Cursor over a null-terminated in-memory string. */
struct CharStream
{
	const char *curpos;		/**< Next character that has not been handed out yet */
};

/**
 * @brief Stream reader callback that pulls characters from a CharStream.
 *
 * Copies characters into the buffer until either maxlength characters were
 * copied or the terminating null of the source string is reached. The
 * terminator itself is never copied and the cursor never moves past it.
 *
 * @param stream		Pointer to a CharStream.
 * @param buffer		Destination buffer.
 * @param maxlength		Maximum number of characters to copy.
 * @param read			Receives the number of characters copied.
 * @return				Always true.
 */
bool CharStreamReader(void *stream, char *buffer, size_t maxlength, unsigned int *read)
{
	CharStream *source = static_cast<CharStream *>(stream);
	const char *const start = source->curpos;
	size_t copied = 0;

	while (copied < maxlength && start[copied] != '\0')
	{
		buffer[copied] = start[copied];
		copied++;
	}

	*read = static_cast<unsigned int>(copied);
	source->curpos = start + copied;

	return true;
}
/**
 * @brief Parses an SMC document held in a null-terminated string.
 *
 * Wraps the string in a CharStream and hands it to ParseStream_SMC() with
 * CharStreamReader as the reader callback.
 *
 * @param stream	Null-terminated text to parse.
 * @param smc		Listener forwarded to ParseStream_SMC().
 * @param line		Forwarded to ParseStream_SMC().
 * @param col		Forwarded to ParseStream_SMC().
 * @return			The result of ParseStream_SMC().
 */
SMCParseError TextParsers::ParseString_SMC(const char *stream,
										   ITextListener_SMC *smc,
										   unsigned int *line,
										   unsigned int *col)
{
	CharStream cursor;
	cursor.curpos = stream;

	return ParseStream_SMC(&cursor, CharStreamReader, smc, line, col);
}
/**
* File streams
*/
/**
 * @brief Stream reader callback that pulls bytes from a C FILE handle.
 *
 * @param stream		A FILE pointer.
 * @param buffer		Destination buffer.
 * @param maxlength		Maximum number of bytes to read.
 * @param read			Receives the number of bytes fread() delivered.
 * @return				True when nothing was read and the end-of-file
 *						indicator is set; otherwise true only if the
 *						stream's error indicator is clear.
 */
bool FileStreamReader(void *stream, char *buffer, size_t maxlength, unsigned int *read)
{
	FILE *fp = static_cast<FILE *>(stream);

	const size_t got = fread(buffer, 1, maxlength, fp);
	*read = static_cast<unsigned int>(got);

	/* An empty read at end-of-file counts as success */
	const bool clean_eof = (got == 0) && (feof(fp) != 0);

	return clean_eof || (ferror(fp) == 0);
}
/**
 * @brief Parses an SMC document from a file on disk.
 *
 * Opens the file in "rt" mode, runs ParseStream_SMC() over it with
 * FileStreamReader as the reader callback, and closes the file afterwards.
 *
 * @param file		Path of the file to open.
 * @param smc		Listener forwarded to ParseStream_SMC().
 * @param line		Forwarded to ParseStream_SMC().
 * @param col		Forwarded to ParseStream_SMC().
 * @return			SMCParse_StreamOpen if the file could not be opened,
 *					otherwise the result of ParseStream_SMC().
 */
SMCParseError TextParsers::ParseFile_SMC(const char *file, ITextListener_SMC *smc, unsigned int *line, unsigned int *col)
{
	FILE *handle = fopen(file, "rt");
	if (handle == NULL)
	{
		return SMCParse_StreamOpen;
	}

	const SMCParseError status = ParseStream_SMC(handle, FileStreamReader, smc, line, col);

	fclose(handle);

	return status;
}
/**
* Raw parsing of streams with helper functions
*/
/**
 * Describes one string that is being staged by the SMC parser.
 * The pointers refer into the parser's input buffer.
 */
struct StringInfo
{
	bool quoted;	/**< Set once the string was closed by a quote mark */
	char *ptr;		/**< Start of the string (the opening quote, if quoted), or NULL if unused */
	char *end;		/**< Position that gets overwritten with the terminator, or NULL */
	bool special;	/**< Set when a backslash was seen inside the quoted string */

	/** Creates an empty (unused) entry. */
	StringInfo()
	{
		quoted = false;
		ptr = NULL;
		end = NULL;
		special = false;
	}
};
/**
 * @brief Turns a staged string into a usable null-terminated C string.
 *
 * The string is edited in place inside the parse buffer:
 *  - for quoted strings, the start pointer is moved past the opening quote;
 *  - for strings flagged as special, the escapes \n, \t, \r, \\ and \" are
 *    collapsed to the character they stand for; any other backslash is
 *    kept literally;
 *  - if an end pointer is set, a terminator is written there.
 *
 * Note that the start pointer inside the descriptor is modified for quoted
 * strings, so the function must not be run twice on the same descriptor.
 *
 * @param data		String descriptor to finalize.
 * @return			Start of the finalized string, or NULL if the
 *					descriptor is empty.
 */
const char *FixupString(StringInfo &data)
{
	if (data.ptr == NULL)
	{
		return NULL;
	}

	/* Step over the opening quote mark */
	if (data.quoted)
	{
		++data.ptr;
	}

	if (data.special)
	{
		char *const text = data.ptr;
		const size_t length = data.end - data.ptr;

		/* An escape sequence needs at least two characters */
		if (length >= 2)
		{
			char *dest = text;
			size_t pos = 0;

			while (pos < length)
			{
				if (text[pos] == '\\' && pos < length - 1)
				{
					/* Peek at the escaped character. Known escapes are
					 * resolved in place and the backslash is skipped;
					 * anything else leaves the backslash to be copied.
					 */
					switch (text[pos + 1])
					{
					case 'n':
						text[++pos] = '\n';
						break;
					case 't':
						text[++pos] = '\t';
						break;
					case 'r':
						text[++pos] = '\r';
						break;
					case '\\':
					case '"':
						++pos;
						break;
					default:
						break;
					}
				}
				*dest++ = text[pos];
				++pos;
			}
			*dest = '\0';
		}
	}

	if (data.end != NULL)
	{
		*data.end = '\0';
	}

	return data.ptr;
}
/**
 * @brief Pushes the string staged in slot 0 one step down the queue.
 *
 * If slot 2 is already occupied nothing is moved and its pointer is
 * returned, which callers treat as "too many strings". Otherwise, when
 * slot 0 holds a string, slot 1 moves to slot 2, slot 0 moves to slot 1 and
 * slot 0 is reset.
 *
 * @param info		Array of three staged string slots.
 * @return			Pointer held by slot 2 if it was occupied, else NULL.
 */
const char *rotate(StringInfo info[3])
{
	const char *overflow = info[2].ptr;

	if (overflow == NULL && info[0].ptr != NULL)
	{
		for (int slot = 2; slot > 0; --slot)
		{
			info[slot] = info[slot - 1];
		}
		info[0] = StringInfo();
	}

	return overflow;
}
/**
 * @brief Resets all three staged string slots to the empty state.
 *
 * @param info		Array of three staged string slots.
 */
void scrap(StringInfo info[3])
{
	const StringInfo blank;

	for (int slot = 0; slot < 3; ++slot)
	{
		info[slot] = blank;
	}
}
/**
 * @brief Shifts a staged string's pointers toward the start of the buffer.
 *
 * Used after the parse buffer contents have been moved down; pointers that
 * are NULL are left untouched.
 *
 * @param data		String descriptor to adjust.
 * @param bytes		Number of bytes the buffer contents were moved by.
 */
void reloc(StringInfo &data, unsigned int bytes)
{
	if (data.ptr != NULL)
	{
		data.ptr = data.ptr - bytes;
	}

	if (data.end != NULL)
	{
		data.end = data.end - bytes;
	}
}
/**
 * @brief Finds the start pointer of the first occupied slot, checking slot 2,
 * then slot 1, then slot 0.
 *
 * @param info		Array of three staged string slots.
 * @return			That slot's start pointer, or NULL if every slot is empty.
 */
char *lowstring(StringInfo info[3])
{
	if (info[2].ptr != NULL)
	{
		return info[2].ptr;
	}

	if (info[1].ptr != NULL)
	{
		return info[1].ptr;
	}

	/* Either the last candidate or NULL when nothing is staged */
	return info[0].ptr;
}
/**
 * @brief Parses an SMC document from an arbitrary stream.
 *
 * Input is pulled through the reader callback into a fixed 4096 byte buffer
 * and scanned one character at a time. Up to three strings are staged per
 * line (slot 0 is the string being read, slots 1 and 2 hold completed ones)
 * and the listener is told about raw lines, new sections, key/value pairs
 * and section ends as they are recognized.
 *
 * @param stream	Opaque pointer handed to the reader callback.
 * @param srdr		Reader callback; parsing stops when it returns false or
 *					reports zero bytes read.
 * @param smc		Listener receiving the parse events.
 * @param line		Optional; on failure receives the current line number.
 * @param col		Optional; on failure receives the current token counter.
 * @return			SMCParse_Okay on success, otherwise the error code.
 *					On success line and col are not written. The listener's
 *					ReadSMC_ParseEnd() is called in both cases.
 */
SMCParseError TextParsers::ParseStream_SMC(void *stream,
										   STREAMREADER srdr,
										   ITextListener_SMC *smc,
										   unsigned int *line,
										   unsigned int *col)
{
	char in_buf[4096];
	char *reparse_point = NULL;
	char *parse_point = in_buf;
	char *line_begin = in_buf;
	unsigned int read;
	unsigned int curline = 1;
	unsigned int curtok = 0;
	unsigned int curlevel = 0;
	bool in_quote = false;
	bool ignoring = false;
	bool eol_comment = false;
	bool ml_comment = false;
	unsigned int i;
	SMCParseError err = SMCParse_Okay;
	SMCParseResult res;
	char c;

	StringInfo strings[3];
	StringInfo emptystring;

	smc->ReadSMC_ParseStart();

	while (srdr(stream, parse_point, sizeof(in_buf) - (parse_point - line_begin) - 1, &read))
	{
		if (!read)
		{
			break;
		}

		/* :TODO: do this outside of the main loop somehow
		 * This checks for BOM markings.
		 * The marker is three bytes long, so at least three bytes must have
		 * been read; otherwise the comparison would look at bytes that were
		 * never filled in and "read - 3" would wrap around.
		 */
		if (curline == 1
			&& read >= 3
			&& in_buf[0] == (char)0xEF
			&& in_buf[1] == (char)0xBB
			&& in_buf[2] == (char)0xBF)
		{
			/* Move EVERYTHING down :\ */
			memmove(in_buf, &in_buf[3], read - 3);
			read -= 3;
		}

		/* If the previous pass stopped early for look-ahead, resume from there */
		if (reparse_point)
		{
			read += (parse_point - reparse_point);
			parse_point = reparse_point;
			reparse_point = NULL;
		}

		for (i=0; i<read; i++)
		{
			c = parse_point[i];
			if (c == '\n')
			{
				/* If we got a newline, there's a lot of things that could have happened in the interim.
				 * First, let's make sure the staged strings are rotated.
				 */
				if (strings[0].ptr)
				{
					strings[0].end = &parse_point[i];
					if (rotate(strings) != NULL)
					{
						err = SMCParse_InvalidTokens;
						goto failed;
					}
				}

				/* Next, let's clear some line-based values that may no longer have meaning */
				eol_comment = false;
				in_quote = false;
				if (ignoring && !ml_comment)
				{
					ignoring = false;
				}

				/* Pass the raw line onto the listener */
				if ((res=smc->ReadSMC_RawLine(line_begin, curline)) != SMCParse_Continue)
				{
					err = (res == SMCParse_HaltFail) ? SMCParse_Custom : SMCParse_Okay;
					goto failed;
				}

				/* Now we check the sanity of our staged strings! */
				if (strings[2].ptr)
				{
					if (!curlevel)
					{
						err = SMCParse_InvalidProperty1;
						goto failed;
					}
					/* Assume the next string is a property and pass the info on. */
					if ((res=smc->ReadSMC_KeyValue(
						FixupString(strings[2]),
						FixupString(strings[1]),
						strings[2].quoted,
						strings[1].quoted)) != SMCParse_Continue)
					{
						err = (res == SMCParse_HaltFail) ? SMCParse_Custom : SMCParse_Okay;
						goto failed;
					}
					scrap(strings);
				}

				/* Change the states for the next line */
				curtok = 0;
				curline++;
				line_begin = &parse_point[i+1];		//Note: safe because this gets relocated later
			} else if (ignoring) {
				if (in_quote)
				{
					/* If i was 0, this case is impossible due to reparsing */
					if ((i != 0) && c == '"' && parse_point[i-1] != '\\')
					{
						/* If we reached a quote in an ignore phase,
						 * we're staging a string and we must rotate it out.
						 */
						in_quote = false;
						ignoring = false;
						/* Set our info */
						strings[0].end = &parse_point[i];
						strings[0].quoted = true;
						if (rotate(strings) != NULL)
						{
							/* If we rotated too many strings, there was too much crap on one line */
							err = SMCParse_InvalidTokens;
							goto failed;
						}
					} else if (c == '\\') {
						strings[0].special = true;
						if (i == (read - 1))
						{
							reparse_point = &parse_point[i];
							break;
						}
					}
				} else if (ml_comment) {
					if (c == '*')
					{
						/* Check if we need to get more input first */
						if (i == read - 1)
						{
							reparse_point = &parse_point[i];
							break;
						}
						if (parse_point[i+1] == '/')
						{
							ml_comment = false;
							ignoring = false;
							/* We should not be staging anything right now. */
							assert(strings[0].ptr == NULL);
							/* Advance the input stream so we don't choke on this token */
							i++;
							curtok++;
						}
					}
				}
			} else {
				/* Check if we're whitespace or not.
				 * The table is indexed through unsigned char, with a bounds
				 * check, so that bytes with the high bit set (for example
				 * UTF-8 text) can never index outside of it. Such bytes are
				 * treated as ordinary, non-whitespace characters.
				 */
				const unsigned char uc = static_cast<unsigned char>(c);
				const bool is_ws = (uc < sizeof(g_ws_chartable) / sizeof(g_ws_chartable[0]))
					&& (g_ws_chartable[uc] != 0);

				if (!is_ws)
				{
					bool restage = false;
					/* Check various special tokens:
					 * ;
					 * //
					 * / *
					 * {
					 * }
					 */
					if (c == ';' || c == '/')
					{
						/* If it's a line-based comment (that is, ; or //)
						 * we will need to scrap everything until the end of the line.
						 */
						if (c == '/')
						{
							if (i == read - 1)
							{
								/* If we reached the end of the look-ahead, we need to re-check our input.
								 * Breaking out will force this to be the new reparse point!
								 */
								reparse_point = &parse_point[i];
								break;
							}
							if (parse_point[i + 1] == '/')
							{
								/* standard comment */
								ignoring = true;
								eol_comment = true;
								restage = true;
							} else if (parse_point[i+1] == '*') {
								/* inline comment - start ignoring */
								ignoring = true;
								ml_comment = true;
								/* yes, we restage, meaning that:
								 * STR/ *stuff* /ING  (space because ml comments don't nest in C++)
								 * will not generate 'STRING', but rather 'STR' and 'ING'.
								 * This should be a rare occurrence and is done here for convenience.
								 */
								restage = true;
							}
						} else {
							ignoring = true;
							eol_comment = true;
							restage = true;
						}
					} else if (c == '{') {
						/* If we are staging a string, we must rotate here.
						 * NOTE(review): unlike the other rotation sites, the
						 * end of the staged string is not recorded before
						 * rotating -- confirm that an unquoted name directly
						 * followed by '{' is terminated as intended.
						 */
						if (strings[0].ptr)
						{
							/* We have unacceptable tokens on this line */
							if (rotate(strings) != NULL)
							{
								err = SMCParse_InvalidSection1;
								goto failed;
							}
						}
						/* Sections must always be alone */
						if (strings[2].ptr != NULL)
						{
							err = SMCParse_InvalidSection1;
							goto failed;
						} else if (strings[1].ptr == NULL) {
							err = SMCParse_InvalidSection2;
							goto failed;
						}
						if ((res=smc->ReadSMC_NewSection(FixupString(strings[1]), strings[1].quoted))
							!= SMCParse_Continue)
						{
							err = (res == SMCParse_HaltFail) ? SMCParse_Custom : SMCParse_Okay;
							goto failed;
						}
						strings[1] = emptystring;
						curlevel++;
					} else if (c == '}') {
						/* Unlike our matching friend, this can be on the same line as something prior.
						 * NOTE(review): the end of a string still being staged
						 * is not recorded here either -- confirm.
						 */
						if (rotate(strings) != NULL)
						{
							err = SMCParse_InvalidSection3;
							goto failed;
						}
						if (strings[2].ptr)
						{
							if (!curlevel)
							{
								err = SMCParse_InvalidProperty1;
								goto failed;
							}
							if ((res=smc->ReadSMC_KeyValue(
								FixupString(strings[2]),
								FixupString(strings[1]),
								strings[2].quoted,
								strings[1].quoted))
								!= SMCParse_Continue)
							{
								err = (res == SMCParse_HaltFail) ? SMCParse_Custom : SMCParse_Okay;
								goto failed;
							}
						} else if (strings[1].ptr) {
							err = SMCParse_InvalidSection3;
							goto failed;
						} else if (!curlevel) {
							err = SMCParse_InvalidSection4;
							goto failed;
						}
						/* Now it's safe to leave the section */
						scrap(strings);
						if ((res=smc->ReadSMC_LeavingSection()) != SMCParse_Continue)
						{
							err = (res == SMCParse_HaltFail) ? SMCParse_Custom : SMCParse_Okay;
							goto failed;
						}
						curlevel--;
					} else if (c == '"') {
						/* If we get a quote mark, we always restage, but we need to do it beforehand */
						if (strings[0].ptr)
						{
							strings[0].end = &parse_point[i];
							if (rotate(strings) != NULL)
							{
								err = SMCParse_InvalidTokens;
								goto failed;
							}
						}
						strings[0].ptr = &parse_point[i];
						in_quote = true;
						ignoring = true;
					} else if (!strings[0].ptr) {
						/* If we have no string, we must start one */
						strings[0].ptr = &parse_point[i];
					}
					if (restage && strings[0].ptr)
					{
						strings[0].end = &parse_point[i];
						if (rotate(strings) != NULL)
						{
							err = SMCParse_InvalidTokens;
							goto failed;
						}
					}
				} else {
					/* If we're eating a string and get whitespace, we need to restage.
					 * (Note that if we are quoted, this is being ignored)
					 */
					if (strings[0].ptr)
					{
						/*
						 * The specification says the second string in a pair does not need to be quoted.
						 * Thus, we check if there's already a string on the stack.
						 * If there's a newline, we always rotate so the newline has an empty starter.
						 */
						if (!strings[1].ptr)
						{
							/* There's no string, so we must move this one down and eat up another */
							strings[0].end = &parse_point[i];
							rotate(strings);
						} else if (!strings[1].quoted) {
							err = SMCParse_InvalidTokens;
							goto failed;
						}
					}
				}
			}

			/* Advance which token we're on */
			curtok++;
		}

		if (line_begin != in_buf)
		{
			/* The line buffer has advanced, so it's safe to copy N bytes back to the beginning.
			 * What's N?  N is the lowest point we're currently relying on.
			 */
			char *stage = lowstring(strings);
			if (!stage || stage > line_begin)
			{
				stage = line_begin;
			}
			unsigned int bytes = read - (stage - parse_point);
			/* It is now safe to delete everything before the staged point */
			memmove(in_buf, stage, bytes);
			/* Calculate the number of bytes in the new buffer */
			bytes = stage - in_buf;
			/* Relocate all the cached pointers to our new base */
			line_begin -= bytes;
			reloc(strings[0], bytes);
			reloc(strings[1], bytes);
			reloc(strings[2], bytes);
			if (reparse_point)
			{
				reparse_point -= bytes;
			}
			if (parse_point)
			{
				parse_point = &parse_point[read];
				parse_point -= bytes;
			}
		} else if (read == sizeof(in_buf) - 1) {
			/* The buffer is full and no line has completed inside it */
			err = SMCParse_TokenOverflow;
			goto failed;
		}
	}

	/* If we're done parsing and there are tokens left over... */
	if (curlevel)
	{
		err = SMCParse_InvalidSection5;
		goto failed;
	} else if (strings[0].ptr || strings[1].ptr) {
		err = SMCParse_InvalidTokens;
		goto failed;
	}

	smc->ReadSMC_ParseEnd(false, false);

	return SMCParse_Okay;

failed:
	if (line)
	{
		*line = curline;
	}

	smc->ReadSMC_ParseEnd(true, (err == SMCParse_Custom));

	if (col)
	{
		*col = curtok;
	}

	return err;
}
/**
* INI parser
*/
/**
 * Whitespace test for the INI parser.
 *
 * The lookup goes through unsigned char and is bounds checked, so bytes with
 * the high bit set (for example UTF-8 text) never index outside the table;
 * they are simply reported as non-whitespace.
 */
static bool IniIsSpaceByte(char ch)
{
	const unsigned char uc = static_cast<unsigned char>(ch);

	return (uc < sizeof(g_ws_chartable) / sizeof(g_ws_chartable[0]))
		&& (g_ws_chartable[uc] != 0);
}

/**
 * @brief Parses an INI file line by line, reporting events to a listener.
 *
 * Each line (read into a 2048 byte buffer) is stripped of leading
 * whitespace, of anything following an unquoted ';' and of trailing
 * whitespace. Lines that end up empty are skipped. Every remaining line is
 * passed to ReadINI_RawLine() and then reported either as a section header
 * (if it starts with '[') or as a key/value pair.
 *
 * @param file			Path of the file to open.
 * @param ini_listener	Listener receiving the parse events.
 * @param line			Optional; receives the parser's line counter, or 0
 *						if the file could not be opened.
 * @param col			Optional; written only when a listener callback
 *						aborted the parse.
 * @return				False if the file could not be opened or a listener
 *						callback returned false, true otherwise.
 */
bool TextParsers::ParseFile_INI(const char *file, ITextListener_INI *ini_listener, unsigned int *line, unsigned int *col)
{
	FILE *fp = fopen(file, "rt");
	unsigned int curline = 0;
	unsigned int curtok;
	size_t len;

	if (!fp)
	{
		if (line)
		{
			*line = 0;
		}

		return false;
	}

	char buffer[2048];
	char *ptr, *save_ptr;
	bool in_quote;

	while (!feof(fp))
	{
		curline++;
		curtok = 0;
		buffer[0] = '\0';
		if (fgets(buffer, sizeof(buffer), fp) == NULL)
		{
			break;
		}

		//:TODO: this will only run once, so find a nice way to move it out of the while loop
		/* If this is the first line, check the first three bytes for BOM */
		if (curline == 1 &&
			buffer[0] == (char)0xEF &&
			buffer[1] == (char)0xBB &&
			buffer[2] == (char)0xBF)
		{
			/* We have a UTF-8 marked file... skip these bytes */
			ptr = &buffer[3];
		} else {
			ptr = buffer;
		}

		/***************************************************
		 * We preprocess the string before parsing tokens! *
		 ***************************************************/

		/* First strip beginning whitespace */
		while (*ptr != '\0' && IniIsSpaceByte(*ptr))
		{
			ptr++;
		}

		len = strlen(ptr);

		if (!len)
		{
			continue;
		}

		/* Now search for comment characters */
		in_quote = false;
		save_ptr = ptr;
		for (size_t i=0; i<len; i++,ptr++)
		{
			if (!in_quote)
			{
				switch (*ptr)
				{
				case '"':
					{
						in_quote = true;
						break;
					}
				case ';':
					{
						/* Stop the loop (the shortened length ends it) */
						len = i;
						/* Terminate the string here */
						*ptr = '\0';
						break;
					}
				}
			} else {
				if (*ptr == '"')
				{
					in_quote = false;
				}
			}
		}

		if (!len)
		{
			continue;
		}

		ptr = save_ptr;

		/* Lastly, strip ending whitespace off */
		while (len > 0 && IniIsSpaceByte(ptr[len - 1]))
		{
			ptr[len - 1] = '\0';
			len--;
		}

		if (!len)
		{
			continue;
		}

		if (!ini_listener->ReadINI_RawLine(ptr, &curtok))
		{
			goto event_failed;
		}

		if (*ptr == '[')
		{
			bool invalid_tokens = false;
			bool got_bracket = false;
			bool extra_tokens = false;
			char c;
			bool alnum;
			wchar_t wc;

			for (size_t i=1; i<len; i++)
			{
				c = ptr[i];
				alnum = false;

				if (c & (1<<7))
				{
					/* Multi-byte character: classify it as a wide character
					 * and skip over its remaining bytes.
					 */
					if (mbtowc(&wc, &ptr[i], len-i) != -1)
					{
						alnum = (iswalnum(wc) != 0);
						i += _GetUTF8CharBytes(&ptr[i]) - 1;
					}
				} else {
					alnum = (isalnum(static_cast<unsigned char>(c)) != 0)
						|| (g_ini_chartable1[static_cast<unsigned char>(c)] != 0);
				}
				if (!alnum)
				{
					/* First check - is this a bracket? */
					if (c == ']')
					{
						/* Yes! */
						got_bracket = true;
						/* If this isn't the last character... */
						if (i != len - 1)
						{
							extra_tokens = true;
						}
						/* terminate */
						ptr[i] = '\0';
						break;
					} else {
						/* n...No! Continue copying. */
						invalid_tokens = true;
					}
				}
			}

			/* Tell the handler */
			if (!ini_listener->ReadINI_NewSection(&ptr[1], invalid_tokens, got_bracket, extra_tokens, &curtok))
			{
				goto event_failed;
			}
		} else {
			char *key_ptr = ptr;
			char *val_ptr = NULL;
			char c;
			size_t first_space = 0;
			bool invalid_tokens = false;
			bool equal_token = false;
			bool quotes = false;
			bool alnum;
			wchar_t wc;

			for (size_t i=0; i<len; i++)
			{
				c = ptr[i];
				alnum = false;
				/* is this an invalid char? */
				if (c & (1<<7))
				{
					if (mbtowc(&wc, &ptr[i], len-i) != -1)
					{
						alnum = (iswalnum(wc) != 0);
						i += _GetUTF8CharBytes(&ptr[i]) - 1;
					}
				} else {
					alnum = (isalnum(static_cast<unsigned char>(c)) != 0)
						|| (g_ini_chartable1[static_cast<unsigned char>(c)] != 0);
				}

				if (!alnum)
				{
					if (IniIsSpaceByte(c))
					{
						/* if it's a space, keep track of the first occurring space */
						if (!first_space)
						{
							first_space = i;
						}
					} else {
						if (c == '=')
						{
							/* if it's an equal sign, we're done with the key */
							if (first_space)
							{
								/* remove excess whitespace */
								key_ptr[first_space] = '\0';
							} else {
								/* remove the equal sign */
								key_ptr[i] = '\0';
							}
							if (ptr[++i] != '\0')
							{
								/* If this isn't the end, set next pointer */
								val_ptr = &ptr[i];
							}
							equal_token = true;
							break;
						} else {
							/* Mark that we got something invalid! */
							invalid_tokens = true;
							first_space = 0;
						}
					}
				}
			}

			/* Now we need to parse the value, if any */
			if (val_ptr)
			{
				/* eat up spaces!  there shouldn't be any here */
				while ((*val_ptr != '\0') && IniIsSpaceByte(*val_ptr))
				{
					val_ptr++;
				}
				if (*val_ptr == '\0')
				{
					val_ptr = NULL;
					goto skip_value;
				}
				/* Do we have an initial quote?  If so, the parsing rules change! */
				if (*val_ptr == '"')
				{
					len = strlen(val_ptr);
					/* Two characters are needed for an opening and a closing
					 * quote; a lone quote mark is not a quoted string.
					 */
					if (len >= 2 && val_ptr[len-1] == '"')
					{
						/* Strip quotes! */
						val_ptr[--len] = '\0';
						val_ptr++;
						quotes = true;
					}
				}
			}
skip_value:
			/* We're done!
			 * The token position is the value's offset in the line buffer;
			 * without a value there is no offset to compute, so report 0.
			 */
			curtok = (val_ptr != NULL) ? static_cast<unsigned int>(val_ptr - buffer) : 0;
			if (!ini_listener->ReadINI_KeyValue(key_ptr, val_ptr, invalid_tokens, equal_token, quotes, &curtok))
			{
				curtok = 0;
				goto event_failed;
			}
		}
	}

	if (line)
	{
		*line = curline;
	}

	fclose(fp);

	return true;

event_failed:
	if (line)
	{
		*line = curline;
	}

	if (col)
	{
		*col = curtok;
	}

	fclose(fp);

	return false;
}
/**
 * @brief Looks up the message text for an SMC parse error code.
 *
 * @param err		Error code to describe.
 * @return			The message for the code, or NULL if the code is outside
 *					the range SMCParse_Okay..SMCParse_InvalidProperty1 or has
 *					no message in the table.
 */
const char *TextParsers::GetSMCErrorString(SMCParseError err)
{
	/* Indexed directly by the error code */
	static const char *s_errors[] =
	{
		NULL,
		"Stream failed to open",
		"Stream returned read error",
		NULL,
		"Un-quoted section has invalid tokens",
		"Section declared without header",
		"Section declared with unknown tokens",
		"Section ending without a matching section beginning",
		"Section beginning without a matching ending",
		"Line contained too many invalid tokens",
		"Token buffer overflowed",
		"A property was declared outside of a section",
	};

	const bool in_range = (err >= SMCParse_Okay) && (err <= SMCParse_InvalidProperty1);

	return in_range ? s_errors[err] : NULL;
}