sourcemod/core/logic/TextParsers.cpp
2015-08-30 18:03:31 -07:00

1099 lines
25 KiB
C++

/**
* vim: set ts=4 sw=4 tw=99 noet :
* =============================================================================
* SourceMod
* Copyright (C) 2004-2008 AlliedModders LLC. All rights reserved.
* =============================================================================
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, version 3.0, as published by the
* Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, AlliedModders LLC gives you permission to link the
* code of this program (as well as its derivative works) to "Half-Life 2," the
* "Source Engine," the "SourcePawn JIT," and any Game MODs that run on software
* by the Valve Corporation. You must obey the GNU General Public License in
* all respects for all other code used. Additionally, AlliedModders LLC grants
* this exception to all derivative works. AlliedModders LLC defines further
* exceptions, found in LICENSE.txt (as of this writing, version JULY-31-2007),
* or <http://www.sourcemod.net/license.php>.
*
* Version: $Id$
*/
#include <stdio.h>
#include <ctype.h>
#include <wctype.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include "TextParsers.h"
#include <ILibrarySys.h>
#include <am-string.h>
/* File-scope parser instance; its constructor fills the character lookup tables below. */
TextParsers g_TextParser;
/* Interface pointer referring to the instance above. */
ITextParsers *textparsers = &g_TextParser;
/**
 * Character classification tables, indexed by a byte value cast to unsigned char.
 * A non-zero entry means the byte belongs to the class:
 *  - g_ini_chartable1: extra punctuation accepted in INI identifiers.
 *  - g_ws_chartable:   whitespace characters.
 *
 * Both tables hold 256 entries so that every possible byte value, including 0xFF,
 * is a valid index.
 */
static int g_ini_chartable1[256] = {0};
static int g_ws_chartable[256] = {0};
/**
 * Tells whether the first byte of a stream is a whitespace character.
 *
 * @param stream		Pointer to the byte to classify; only the first byte is read.
 * @return				True if the whitespace table entry for that byte is 1.
 */
bool TextParsers::IsWhitespace(const char *stream)
{
	const unsigned char first = static_cast<unsigned char>(*stream);

	return (g_ws_chartable[first] == 1);
}
/**
 * Populates the two character lookup tables used by the parsers.
 */
TextParsers::TextParsers()
{
	/* Punctuation treated like an alphanumeric character in INI identifiers */
	static const char ini_extras[] = "_-,+.$?/";
	for (const char *p = ini_extras; *p != '\0'; p++)
	{
		g_ini_chartable1[(unsigned char)*p] = 1;
	}

	/* Characters considered whitespace */
	static const char whitespace[] = "\n\v\r\t\f ";
	for (const char *p = whitespace; *p != '\0'; p++)
	{
		g_ws_chartable[(unsigned char)*p] = 1;
	}
}
/**
 * Registers this object through sharesys->AddInterface().
 * NULL is passed as the first argument; its meaning is defined by the share system,
 * which is declared outside this file.
 */
void TextParsers::OnSourceModAllInitialized()
{
sharesys->AddInterface(NULL, this);
}
/**
 * Forwards to _GetUTF8CharBytes(), which is defined outside this file.
 * Presumably returns the number of bytes occupied by the UTF-8 character at the
 * start of the stream - confirm against the helper's definition.
 *
 * @param stream		Pointer to the character to inspect.
 * @return				Whatever _GetUTF8CharBytes() returns for the stream.
 */
unsigned int TextParsers::GetUTF8CharBytes(const char *stream)
{
return _GetUTF8CharBytes(stream);
}
/**
* File streams
*/
/**
 * Stream reader callback that pulls bytes from a C FILE handle.
 *
 * @param stream		FILE pointer, passed as an opaque pointer.
 * @param buffer		Destination buffer.
 * @param maxlength		Maximum number of bytes to read.
 * @param read			Receives the number of bytes actually read.
 * @return				True when nothing was read because the end of file was hit,
 *						or when the file has no error flagged; false otherwise.
 */
bool FileStreamReader(void *stream, char *buffer, size_t maxlength, unsigned int *read)
{
	FILE *fp = static_cast<FILE *>(stream);

	const size_t got = fread(buffer, 1, maxlength, fp);
	*read = static_cast<unsigned int>(got);

	/* An empty read at end-of-file counts as success; the caller sees *read == 0. */
	const bool clean_eof = (got == 0) && (feof(fp) != 0);

	return clean_eof || (ferror(fp) == 0);
}
/**
 * Opens a file in text mode and runs the SMC stream parser over it.
 *
 * @param file			Path of the file to open.
 * @param smc			Listener handed to ParseStream_SMC().
 * @param states		Optional; on open failure its line and col are set to 0,
 *						otherwise it is passed through to ParseStream_SMC().
 * @return				SMCError_StreamOpen if the file could not be opened,
 *						otherwise the result of ParseStream_SMC().
 */
SMCError TextParsers::ParseFile_SMC(const char *file, ITextListener_SMC *smc, SMCStates *states)
{
	FILE *fp = fopen(file, "rt");

	if (fp != NULL)
	{
		const SMCError result = ParseStream_SMC(fp, FileStreamReader, smc, states);
		fclose(fp);
		return result;
	}

	/* Nothing was parsed, so report position zero */
	if (states != NULL)
	{
		states->line = 0;
		states->col = 0;
	}

	return SMCError_StreamOpen;
}
/**
 * Opens a file in text mode, parses it as SMC and writes a message into a buffer.
 *
 * @param file			Path of the file to open.
 * @param smc_listener	Listener handed to ParseStream_SMC().
 * @param states		Optional; on open failure its line and col are set to 0,
 *						otherwise it is passed through to ParseStream_SMC().
 * @param buffer		Receives the message text.
 * @param maxsize		Size of the message buffer.
 * @return				SMCError_StreamOpen if the file could not be opened,
 *						otherwise the result of ParseStream_SMC().
 *
 * Note: the message is taken from GetSMCErrorString(), whose table entry for
 * SMCError_Okay is NULL, so a successful parse leaves "Unknown error" in the buffer.
 */
SMCError TextParsers::ParseSMCFile(const char *file, 
								   ITextListener_SMC *smc_listener, 
								   SMCStates *states, 
								   char *buffer, 
								   size_t maxsize)
{
	FILE *fp = fopen(file, "rt");

	if (!fp)
	{
		if (states != NULL)
		{
			states->line = 0;
			states->col = 0;
		}

		/* Fallback text in case the platform layer does not fill the buffer */
		char error[256] = "unknown";
		libsys->GetPlatformError(error, sizeof(error));
		ke::SafeSprintf(buffer, maxsize, "File could not be opened: %s", error);

		return SMCError_StreamOpen;
	}

	const SMCError result = ParseStream_SMC(fp, FileStreamReader, smc_listener, states);
	fclose(fp);

	const char *errstr = GetSMCErrorString(result);
	if (errstr == NULL)
	{
		errstr = "Unknown error";
	}
	ke::SafeSprintf(buffer, maxsize, "%s", errstr);

	return result;
}
/**
 * Cursor over an in-memory character buffer, used by RawStreamReader().
 */
struct RawStream
{
	const char *stream;		/* Start of the data */
	size_t length;			/* Total number of bytes available */
	size_t pos;				/* Number of bytes already handed out */
};

/**
 * Stream reader callback that copies bytes out of a RawStream.
 *
 * @param stream		RawStream pointer, passed as an opaque pointer.
 * @param buffer		Destination buffer.
 * @param maxlength		Maximum number of bytes to copy.
 * @param read			Receives the number of bytes copied; left untouched when
 *						the stream is already exhausted.
 * @return				False if the stream is exhausted, true otherwise.
 */
bool RawStreamReader(void *stream, char *buffer, size_t maxlength, unsigned int *read)
{
	RawStream *src = static_cast<RawStream *>(stream);

	if (src->pos >= src->length)
	{
		return false;
	}

	/* Hand out whatever is left, capped at the caller's buffer size */
	size_t chunk = src->length - src->pos;
	if (chunk > maxlength)
	{
		chunk = maxlength;
	}

	memcpy(buffer, src->stream + src->pos, chunk);
	src->pos += chunk;
	*read = static_cast<unsigned int>(chunk);

	assert(src->pos <= src->length);

	return true;
}
/**
 * Parses an in-memory buffer as SMC and writes a message into a buffer.
 *
 * @param stream		Start of the data to parse.
 * @param length		Number of bytes available at stream.
 * @param smc_listener	Listener handed to ParseStream_SMC().
 * @param states		Passed through to ParseStream_SMC().
 * @param buffer		Receives the message text.
 * @param maxsize		Size of the message buffer.
 * @return				The result of ParseStream_SMC().
 *
 * Note: the message is taken from GetSMCErrorString(), whose table entry for
 * SMCError_Okay is NULL, so a successful parse leaves "Unknown error" in the buffer.
 */
SMCError TextParsers::ParseSMCStream(const char *stream, 
									 size_t length, 
									 ITextListener_SMC *smc_listener, 
									 SMCStates *states, 
									 char *buffer, 
									 size_t maxsize)
{
	/* Wrap the raw memory in a cursor that starts at the first byte */
	RawStream rs = { stream, length, 0 };

	const SMCError result = ParseStream_SMC(&rs, RawStreamReader, smc_listener, states);

	const char *errstr = GetSMCErrorString(result);
	if (errstr == NULL)
	{
		errstr = "Unknown error";
	}
	ke::SafeSprintf(buffer, maxsize, "%s", errstr);

	return result;
}
/**
* Raw parsing of streams with helper functions
*/
/**
 * Describes one staged string inside the parse buffer.
 */
struct StringInfo
{
	StringInfo()
		: quoted(false),
		  ptr(NULL),
		  end(NULL),
		  special(false)
	{
	}

	bool quoted;		/* String started with a quote mark */
	char *ptr;			/* First character (the opening quote, if quoted) */
	char *end;			/* Position that gets overwritten with the terminator */
	bool special;		/* A backslash was seen inside the string */
};

/**
 * Turns a staged string into a null-terminated C string, in place.
 *
 * The opening quote is skipped, the escape sequences \n, \t, \r, \\ and \"
 * are collapsed into their single-character forms (any other backslash is
 * kept literally), and the end position is overwritten with a terminator.
 *
 * @param data			Staged string; its ptr and the underlying buffer are modified.
 * @return				Pointer to the finished string, or NULL if nothing was staged.
 */
const char *FixupString(StringInfo &data)
{
	if (data.ptr == NULL)
	{
		return NULL;
	}

	/* Step over the opening quote mark.
	 * An unquoted string needs no adjustment: leading whitespace is never staged,
	 * and the end pointer refers to a character that is safe to overwrite.
	 */
	if (data.quoted)
	{
		data.ptr++;
	}

	/* Escapes can only exist if a backslash was seen and there is room for a pair */
	if (data.special)
	{
		const size_t length = data.end - data.ptr;
		if (length >= 2)
		{
			char *dst = data.ptr;
			size_t pos = 0;

			while (pos < length)
			{
				/* A backslash in the very last position has nothing to escape */
				if (data.ptr[pos] == '\\' && pos + 1 < length)
				{
					char &next = data.ptr[pos + 1];
					switch (next)
					{
					case 'n':
						next = '\n';
						pos++;
						break;
					case 't':
						next = '\t';
						pos++;
						break;
					case 'r':
						next = '\r';
						pos++;
						break;
					case '\\':
					case '"':
						/* Drop the backslash, keep the escaped character */
						pos++;
						break;
					default:
						/* Unknown escape: the backslash itself is copied */
						break;
					}
				}
				*dst++ = data.ptr[pos];
				pos++;
			}

			*dst = '\0';
		}
	}

	if (data.end != NULL)
	{
		*(data.end) = '\0';
	}

	return data.ptr;
}
/**
 * Shifts the staged strings up by one slot (0 -> 1 -> 2) and clears slot 0.
 *
 * @param info			The three staging slots.
 * @return				The string held in slot 2 if that slot is occupied, in which
 *						case nothing is moved; NULL otherwise.
 */
const char *rotate(StringInfo info[3])
{
	/* The last slot is still occupied, so there is nowhere to shift to */
	if (info[2].ptr)
	{
		return info[2].ptr;
	}

	/* Nothing staged in slot 0 means there is nothing to shift */
	if (!info[0].ptr)
	{
		return NULL;
	}

	info[2] = info[1];
	info[1] = info[0];
	info[0] = StringInfo();

	return NULL;
}
/**
 * Resets all three staging slots to their default (empty) state.
 *
 * @param info			The three staging slots.
 */
void scrap(StringInfo info[3])
{
	for (int slot = 0; slot < 3; slot++)
	{
		info[slot] = StringInfo();
	}
}
/**
 * Moves the non-NULL pointers of a staged string back by a number of bytes.
 *
 * @param data			Staged string to adjust.
 * @param bytes			Distance to subtract from each set pointer.
 */
void reloc(StringInfo &data, unsigned int bytes)
{
	if (data.ptr != NULL)
	{
		data.ptr = data.ptr - bytes;
	}

	if (data.end != NULL)
	{
		data.end = data.end - bytes;
	}
}
/**
 * Finds the start pointer of the highest-numbered occupied staging slot.
 *
 * @param info			The three staging slots.
 * @return				The ptr of slot 2, else slot 1, else slot 0, whichever is
 *						set first; NULL if all are empty.
 */
char *lowstring(StringInfo info[3])
{
	int slot = 2;

	while (slot >= 0 && info[slot].ptr == NULL)
	{
		slot--;
	}

	return (slot >= 0) ? info[slot].ptr : NULL;
}
/**
 * Parses SMC text pulled from an arbitrary stream reader.
 *
 * Input is read in chunks into a fixed 4096-byte buffer and scanned one byte at
 * a time. Up to three strings are staged per line; depending on the tokens seen
 * they are reported to the listener through ReadSMC_NewSection(),
 * ReadSMC_KeyValue() and ReadSMC_LeavingSection(). Every completed line is also
 * passed to ReadSMC_RawLine(). ReadSMC_ParseStart() is called before reading and
 * ReadSMC_ParseEnd() once parsing stops.
 *
 * Any listener result other than SMCResult_Continue stops the parse:
 * SMCResult_HaltFail yields SMCError_Custom, every other result yields
 * SMCError_Okay, and in both cases ReadSMC_ParseEnd() is told the parse halted.
 *
 * @param stream		Opaque pointer handed to the stream reader.
 * @param srdr			Stream reader callback; parsing stops when it returns false
 *						or reports zero bytes read.
 * @param smc			Listener receiving the parse events. Must not be NULL.
 * @param pStates		Optional; receives the final line/column state on both
 *						success and failure.
 * @return				SMCError_Okay on success, otherwise the error that stopped
 *						the parse.
 */
SMCError TextParsers::ParseStream_SMC(void *stream, 
									  STREAMREADER srdr, 
									  ITextListener_SMC *smc, 
									  SMCStates *pStates)
{
	char *reparse_point = NULL;
	char in_buf[4096];
	char *parse_point = in_buf;
	char *line_begin = in_buf;
	unsigned int read;
	unsigned int curlevel = 0;
	bool in_quote = false;
	bool ignoring = false;
	bool eol_comment = false;
	bool ml_comment = false;
	unsigned int i;
	SMCError err = SMCError_Okay;
	SMCResult res;
	SMCStates states;
	char c;

	StringInfo strings[3];
	StringInfo emptystring;

	states.line = 1;
	states.col = 0;

	smc->ReadSMC_ParseStart();

	/**
	 * The stream reader reads in as much as it can fill the buffer with.
	 * It then processes the buffer.  If the buffer cannot be fully processed, for example, 
	 * a line is left hanging with no newline, then the contents of the buffer is shifted 
	 * down, and the buffer is filled from the stream reader again.
	 *
	 * What makes this particularly annoying is that we cache pointers everywhere, so when 
	 * the shifting process takes place, all those pointers must be shifted as well.
	 */
	while (srdr(stream, parse_point, sizeof(in_buf) - (parse_point - in_buf) - 1, &read))
	{
		if (!read)
		{
			break;
		}

		/* Check for BOM markings, which is only relevant on the first line.
		 * Not worth it, but it could be moved out of the loop.
		 *
		 * The marker is three bytes long, so a shorter read can neither contain it
		 * nor be shifted by it: without the length check the comparison would look at
		 * bytes that were never filled and "read - 3" would wrap around.
		 */
		if (states.line == 1 && 
			read >= 3 &&
			in_buf[0] == (char)0xEF && 
			in_buf[1] == (char)0xBB && 
			in_buf[2] == (char)0xBF)
		{
			/* Move EVERYTHING down :\ */
			memmove(in_buf, &in_buf[3], read - 3);
			read -= 3;
		}

		/* A previous pass stopped early because it needed look-ahead; resume from there */
		if (reparse_point)
		{
			read += (parse_point - reparse_point);
			parse_point = reparse_point;
			reparse_point = NULL;
		}

		for (i=0; i<read; i++)
		{
			c = parse_point[i];
			if (c == '\n')
			{
				/* If we got a newline, there's a lot of things that could have happened in the interim.
				 * First, let's make sure the staged strings are rotated.
				 */
				if (strings[0].ptr)
				{
					strings[0].end = &parse_point[i];
					if (rotate(strings) != NULL)
					{
						err = SMCError_InvalidTokens;
						goto failed;
					}
				}

				/* Next, let's clear some line-based values that may no longer have meaning */
				eol_comment = false;
				in_quote = false;
				if (ignoring && !ml_comment)
				{
					ignoring = false;
				}

				/* Pass the raw line onto the listener.  We terminate the line so the receiver 
				 * doesn't get tons of useless info.  We restore the newline after.
				 */
				parse_point[i] = '\0';
				if ((res=smc->ReadSMC_RawLine(&states, line_begin)) != SMCResult_Continue)
				{
					err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay;
					goto failed;
				}
				parse_point[i] = '\n';

				/* Now we check the sanity of our staged strings! */
				if (strings[2].ptr)
				{
					if (!curlevel)
					{
						err = SMCError_InvalidProperty1;
						goto failed;
					}
					/* Assume the next string is a property and pass the info on. */
					if ((res=smc->ReadSMC_KeyValue(
						&states,
						FixupString(strings[2]),
						FixupString(strings[1]))) != SMCResult_Continue)
					{
						err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay;
						goto failed;
					}
					scrap(strings);
				}

				/* Change the states for the next line */
				states.col = 0;
				states.line++;
				line_begin = &parse_point[i+1];		//Note: safe because this gets relocated later
			}
			else if (ignoring)
			{
				if (in_quote)
				{
					/* If i was 0, we could have reparsed, so make sure there's no buffer underrun */
					if ((&parse_point[i] != in_buf) && c == '"' && parse_point[i-1] != '\\')
					{
						/* If we reached a quote in an ignore phase,
						 * we're staging a string and we must rotate it out.
						 */
						in_quote = false;
						ignoring = false;
						/* Set our info */
						strings[0].end = &parse_point[i];
						strings[0].quoted = true;
						if (rotate(strings) != NULL)
						{
							/* If we rotated too many strings, there was too much crap on one line */
							err = SMCError_InvalidTokens;
							goto failed;
						}
					}
					else if (c == '\\')
					{
						strings[0].special = true;
						if (i == (read - 1))
						{
							reparse_point = &parse_point[i];
							break;
						}
					}
				}
				else if (ml_comment)
				{
					if (c == '*')
					{
						/* Check if we need to get more input first */
						if (i == read - 1)
						{
							reparse_point = &parse_point[i];
							break;
						}
						if (parse_point[i+1] == '/')
						{
							ml_comment = false;
							ignoring = false;
							/* We should not be staging anything right now. */
							assert(strings[0].ptr == NULL);
							/* Advance the input stream so we don't choke on this token */
							i++;
							states.col++;
						}
					}
				}
			}
			else
			{
				/* Check if we're whitespace or not */
				if (!g_ws_chartable[(unsigned char)c])
				{
					bool restage = false;
					/* Check various special tokens:
					 * ;
					 * //
					 * / *
					 * {
					 * }
					 */
					if (c == ';' || c == '/')
					{
						/* If it's a line-based comment (that is, ; or //)
						 * we will need to scrap everything until the end of the line.
						 */
						if (c == '/')
						{
							if (i == read - 1)
							{
								/* If we reached the end of the look-ahead, we need to re-check our input.
								 * Breaking out will force this to be the new reparse point!
								 */
								reparse_point = &parse_point[i];
								break;
							}
							if (parse_point[i + 1] == '/')
							{
								/* standard comment */
								ignoring = true;
								eol_comment = true;
								restage = true;
							}
							else if (parse_point[i+1] == '*')
							{
								/* inline comment - start ignoring */
								ignoring = true;
								ml_comment = true;
								/* yes, we restage, meaning that:
								 * STR/ *stuff* /ING  (space because ml comments don't nest in C++)
								 * will not generate 'STRING', but rather 'STR' and 'ING'.
								 * This should be a rare occurrence and is done here for convenience.
								 */
								restage = true;
							}
						}
						else
						{
							ignoring = true;
							eol_comment = true;
							restage = true;
						}
					}
					else if (c == '{')
					{
						/* If we are staging a string, we must rotate here */
						if (strings[0].ptr)
						{
							/* We have unacceptable tokens on this line */
							if (rotate(strings) != NULL)
							{
								err = SMCError_InvalidSection1;
								goto failed;
							}
						}
						/* Sections must always be alone */
						if (strings[2].ptr != NULL)
						{
							err = SMCError_InvalidSection1;
							goto failed;
						}
						else if (strings[1].ptr == NULL)
						{
							err = SMCError_InvalidSection2;
							goto failed;
						}
						if ((res=smc->ReadSMC_NewSection(&states, FixupString(strings[1])))
							!= SMCResult_Continue)
						{
							err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay;
							goto failed;
						}
						strings[1] = emptystring;
						curlevel++;
					}
					else if (c == '}')
					{
						/* Unlike our matching friend, this can be on the same line as something prior */
						if (rotate(strings) != NULL)
						{
							err = SMCError_InvalidSection3;
							goto failed;
						}
						if (strings[2].ptr)
						{
							if (!curlevel)
							{
								err = SMCError_InvalidProperty1;
								goto failed;
							}
							if ((res=smc->ReadSMC_KeyValue(
								&states,
								FixupString(strings[2]),
								FixupString(strings[1])))
								!= SMCResult_Continue)
							{
								err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay;
								goto failed;
							}
						}
						else if (strings[1].ptr)
						{
							err = SMCError_InvalidSection3;
							goto failed;
						}
						else if (!curlevel)
						{
							err = SMCError_InvalidSection4;
							goto failed;
						}
						/* Now it's safe to leave the section */
						scrap(strings);
						if ((res=smc->ReadSMC_LeavingSection(&states)) != SMCResult_Continue)
						{
							err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay;
							goto failed;
						}
						curlevel--;
					}
					else if (c == '"')
					{
						/* If we get a quote mark, we always restage, but we need to do it beforehand */
						if (strings[0].ptr)
						{
							strings[0].end = &parse_point[i];
							if (rotate(strings) != NULL)
							{
								err = SMCError_InvalidTokens;
								goto failed;
							}
						}
						strings[0].ptr = &parse_point[i];
						in_quote = true;
						ignoring = true;
					}
					else if (!strings[0].ptr)
					{
						/* If we have no string, we must start one */
						strings[0].ptr = &parse_point[i];
					}
					if (restage && strings[0].ptr)
					{
						strings[0].end = &parse_point[i];
						if (rotate(strings) != NULL)
						{
							err = SMCError_InvalidTokens;
							goto failed;
						}
					}
				}
				else
				{
					/* If we're eating a string and get whitespace, we need to restage.
					 * (Note that if we are quoted, this is being ignored)
					 */
					if (strings[0].ptr)
					{
						/*
						 * The specification says the second string in a pair does not need to be quoted.
						 * Thus, we check if there's already a string on the stack.
						 * If there's a newline, we always rotate so the newline has an empty starter.
						 */
						if (!strings[1].ptr)
						{
							/* There's no string, so we must move this one down and eat up another */
							strings[0].end = &parse_point[i];
							rotate(strings);
						}
						else if (!strings[1].quoted)
						{
							err = SMCError_InvalidTokens;
							goto failed;
						}
					}
				}
			}

			/* Advance which token we're on */
			states.col++;
		}

		if (line_begin != in_buf)
		{
			/* The line buffer has advanced, so it's safe to copy N bytes back to the beginning.
			 * What's N?  N is the lowest point we're currently relying on.
			 */
			char *stage = lowstring(strings);
			if (!stage || stage > line_begin)
			{
				stage = line_begin;
			}
			unsigned int bytes = read - (stage - parse_point);

			/* It is now safe to delete everything before the staged point */
			memmove(in_buf, stage, bytes);

			/* Calculate the number of bytes in the new buffer */
			bytes = stage - in_buf;
			/* Relocate all the cached pointers to our new base */
			line_begin -= bytes;
			reloc(strings[0], bytes);
			reloc(strings[1], bytes);
			reloc(strings[2], bytes);
			if (reparse_point)
			{
				reparse_point -= bytes;
			}
			if (parse_point)
			{
				parse_point = &parse_point[read];
				parse_point -= bytes;
			}
		}
		else if (read == sizeof(in_buf) - 1)
		{
			/* The whole buffer was filled without completing a single line */
			err = SMCError_TokenOverflow;
			goto failed;
		}
	}

	/* If we're done parsing and there are tokens left over... */
	if (curlevel)
	{
		err = SMCError_InvalidSection5;
		goto failed;
	}
	else if (strings[0].ptr || strings[1].ptr)
	{
		err = SMCError_InvalidTokens;
		goto failed;
	}

	smc->ReadSMC_ParseEnd(false, false);

	if (pStates != NULL)
	{
		*pStates = states;
	}

	return SMCError_Okay;

failed:
	if (pStates != NULL)
	{
		*pStates = states;
	}

	smc->ReadSMC_ParseEnd(true, (err == SMCError_Custom));

	return err;
}
/**
* INI parser
*/
/**
 * Parses an INI-style file line by line and reports its contents to a listener.
 *
 * Each line is read into a 2048-byte buffer, stripped of leading whitespace,
 * of anything following an unquoted ';' and of trailing whitespace. Lines that
 * end up empty are skipped. Every remaining line is passed to ReadINI_RawLine();
 * lines starting with '[' are then reported through ReadINI_NewSection() and all
 * others through ReadINI_KeyValue(). Parsing stops as soon as a listener callback
 * returns false.
 *
 * @param file			Path of the file to open.
 * @param ini_listener	Listener receiving the parse events.
 * @param line			Optional; receives the number of the last line read
 *						(0 if the file could not be opened).
 * @param col			Optional; receives the token offset when a listener callback
 *						stops the parse (0 if the file could not be opened). It is
 *						not written on success.
 * @return				True if the whole file was processed, false if the file
 *						could not be opened or a listener callback returned false.
 */
bool TextParsers::ParseFile_INI(const char *file, ITextListener_INI *ini_listener, unsigned int *line, unsigned int *col)
{
	FILE *fp = fopen(file, "rt");
	unsigned int curline = 0;
	unsigned int curtok;
	size_t len;

	if (!fp)
	{
		/* Nothing was parsed, so report position zero for both outputs */
		if (line)
		{
			*line = 0;
		}
		if (col)
		{
			*col = 0;
		}

		return false;
	}

	char buffer[2048];
	char *ptr, *save_ptr;
	bool in_quote;

	while (!feof(fp))
	{
		curline++;
		curtok = 0;
		buffer[0] = '\0';
		if (fgets(buffer, sizeof(buffer), fp) == NULL)
		{
			break;
		}

		//:TODO: this will only run once, so find a nice way to move it out of the while loop
		/* If this is the first line, check the first three bytes for BOM */
		if (curline == 1 && 
			buffer[0] == (char)0xEF && 
			buffer[1] == (char)0xBB && 
			buffer[2] == (char)0xBF)
		{
			/* We have a UTF-8 marked file... skip these bytes */
			ptr = &buffer[3];
		} else {
			ptr = buffer;
		}

		/***************************************************
		 * We preprocess the string before parsing tokens! *
		 ***************************************************/

		/* First strip beginning whitespace */
		while (*ptr != '\0' && g_ws_chartable[(unsigned char)*ptr] != 0)
		{
			ptr++;
		}

		len = strlen(ptr);

		if (!len)
		{
			continue;
		}

		/* Now search for comment characters */
		in_quote = false;
		save_ptr = ptr;
		for (size_t i=0; i<len; i++,ptr++)
		{
			if (!in_quote)
			{
				switch (*ptr)
				{
				case '"':
					{
						in_quote = true;
						break;
					}
				case ';':
					{
						/* Stop the loop: shrinking len makes the loop condition fail
						 * (the break below only leaves the switch)
						 */
						len = i;
						/* Terminate the string here */
						*ptr = '\0';
						break;
					}
				}
			} else {
				if (*ptr == '"')
				{
					in_quote = false;
				}
			}
		}

		if (!len)
		{
			continue;
		}

		ptr = save_ptr;

		/* Lastly, strip ending whitespace off.
		 * The index is unsigned, so stepping below zero wraps around and ends the loop.
		 */
		for (size_t i=len-1; i<len; i--)
		{
			if (g_ws_chartable[(unsigned char)ptr[i]])
			{
				ptr[i] = '\0';
				len--;
			} else {
				break;
			}
		}

		if (!len)
		{
			continue;
		}

		if (!ini_listener->ReadINI_RawLine(ptr, &curtok))
		{
			goto event_failed;
		}

		if (*ptr == '[')
		{
			bool invalid_tokens = false;
			bool got_bracket = false;
			bool extra_tokens = false;
			char c;
			bool alnum;
			wchar_t wc;

			for (size_t i=1; i<len; i++)
			{
				c = ptr[i];
				alnum = false;

				/* Bytes with the high bit set are decoded as multi-byte characters */
				if (c & (1<<7))
				{
					if (mbtowc(&wc, &ptr[i], len-i) != -1)
					{
						alnum = (iswalnum(wc) != 0);
						i += _GetUTF8CharBytes(&ptr[i]) - 1;
					}
				} else {
					alnum = (isalnum(c) != 0) || (g_ini_chartable1[(unsigned char)c] != 0);
				}
				if (!alnum)
				{
					/* First check - is this a bracket? */
					if (c == ']')
					{
						/* Yes! */
						got_bracket = true;
						/* If this isn't the last character... */
						if (i != len - 1)
						{
							extra_tokens = true;
						}
						/* terminate */
						ptr[i] = '\0';
						break;
					} else {
						/* n...No! Continue copying. */
						invalid_tokens = true;
					}
				}
			}

			/* Tell the handler */
			if (!ini_listener->ReadINI_NewSection(&ptr[1], invalid_tokens, got_bracket, extra_tokens, &curtok))
			{
				goto event_failed;
			}
		} else {
			char *key_ptr = ptr;
			char *val_ptr = NULL;
			char c;
			size_t first_space = 0;
			bool invalid_tokens = false;
			bool equal_token = false;
			bool quotes = false;
			bool alnum;
			wchar_t wc;

			for (size_t i=0; i<len; i++)
			{
				c = ptr[i];
				alnum = false;

				/* is this an invalid char? */
				if (c & (1<<7))
				{
					if (mbtowc(&wc, &ptr[i], len-i) != -1)
					{
						alnum = (iswalnum(wc) != 0);
						i += _GetUTF8CharBytes(&ptr[i]) - 1;
					}
				} else {
					alnum = (isalnum(c) != 0) || (g_ini_chartable1[(unsigned char)c] != 0);
				}

				if (!alnum)
				{
					if (g_ws_chartable[(unsigned char)c])
					{
						/* if it's a space, keep track of the first occurring space */
						if (!first_space)
						{
							first_space = i;
						}
					} else {
						if (c == '=')
						{
							/* if it's an equal sign, we're done with the key */
							if (first_space)
							{
								/* remove excess whitespace */
								key_ptr[first_space] = '\0';
							} else {
								/* remove the equal sign */
								key_ptr[i] = '\0';
							}
							if (ptr[++i] != '\0')
							{
								/* If this isn't the end, set next pointer */
								val_ptr = &ptr[i];
							}
							equal_token = true;
							break;
						} else {
							/* Mark that we got something invalid! */
							invalid_tokens = true;
							first_space = 0;
						}
					}
				}
			}

			/* Now we need to parse the value, if any */
			if (val_ptr)
			{
				/* eat up spaces!  there shouldn't be any here */
				while ((*val_ptr != '\0') && g_ws_chartable[(unsigned char)*val_ptr] != 0)
				{
					val_ptr++;
				}
				if (*val_ptr == '\0')
				{
					val_ptr = NULL;
					goto skip_value;
				}
				/* Do we have an initial quote?  If so, the parsing rules change! */
				if (*val_ptr == '"' && *val_ptr != '\0')
				{
					len = strlen(val_ptr);
					if (val_ptr[len-1] == '"')
					{
						/* Strip quotes! */
						val_ptr[--len] = '\0';
						val_ptr++;
						quotes = true;
					}
				}
			}
skip_value:
			/* We're done!
			 * The token offset is only meaningful when there is a value; subtracting
			 * from a NULL pointer is undefined, so report zero in that case.
			 */
			curtok = (val_ptr != NULL) ? static_cast<unsigned int>(val_ptr - buffer) : 0;
			if (!ini_listener->ReadINI_KeyValue(key_ptr, val_ptr, invalid_tokens, equal_token, quotes, &curtok))
			{
				curtok = 0;
				goto event_failed;
			}
		}
	}

	if (line)
	{
		*line = curline;
	}

	fclose(fp);

	return true;

event_failed:
	if (line)
	{
		*line = curline;
	}

	if (col)
	{
		*col = curtok;
	}

	fclose(fp);

	return false;
}
/**
 * Maps an SMC error code to a human-readable message.
 *
 * @param err			Error code to look up.
 * @return				The message for the code, or NULL when the code lies outside
 *						the range SMCError_Okay..SMCError_InvalidProperty1. The entry
 *						for the first code in the table is itself NULL.
 */
const char *TextParsers::GetSMCErrorString(SMCError err)
{
	/* One entry per error code, in enumeration order */
	static const char *s_errors[] = 
	{
		NULL,
		"Stream failed to open",
		"Stream returned read error",
		"Callback error",
		"Un-quoted section has invalid tokens",
		"Section declared without header",
		"Section declared with unknown tokens",
		"Section ending without a matching section beginning",
		"Section beginning without a matching ending",
		"Line contained too many invalid tokens",
		"Token buffer overflowed",
		"A property was declared outside of a section",
	};

	const bool in_range = (err >= SMCError_Okay) && (err <= SMCError_InvalidProperty1);

	return in_range ? s_errors[err] : NULL;
}