/** * vim: set ts=4 sw=4 tw=99 noet : * ============================================================================= * SourceMod * Copyright (C) 2004-2008 AlliedModders LLC. All rights reserved. * ============================================================================= * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License, version 3.0, as published by the * Free Software Foundation. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * * As a special exception, AlliedModders LLC gives you permission to link the * code of this program (as well as its derivative works) to "Half-Life 2," the * "Source Engine," the "SourcePawn JIT," and any Game MODs that run on software * by the Valve Corporation. You must obey the GNU General Public License in * all respects for all other code used. Additionally, AlliedModders LLC grants * this exception to all derivative works. AlliedModders LLC defines further * exceptions, found in LICENSE.txt (as of this writing, version JULY-31-2007), * or <http://www.sourcemod.net/license.php>. * * Version: $Id$ */ #include <stdio.h> #include <ctype.h> #include <wctype.h> #include <string.h> #include <stdlib.h> #include <assert.h> #include "TextParsers.h" #include <ILibrarySys.h> #include <am-string.h> TextParsers g_TextParser; ITextParsers *textparsers = &g_TextParser; static int g_ini_chartable1[255] = {0}; static int g_ws_chartable[255] = {0}; bool TextParsers::IsWhitespace(const char *stream) { return g_ws_chartable[(unsigned char)*stream] == 1; } TextParsers::TextParsers() { g_ini_chartable1[(unsigned)'_'] = 1; g_ini_chartable1[(unsigned)'-'] = 1; g_ini_chartable1[(unsigned)','] = 1; g_ini_chartable1[(unsigned)'+'] = 1; g_ini_chartable1[(unsigned)'.'] = 1; g_ini_chartable1[(unsigned)'$'] = 1; g_ini_chartable1[(unsigned)'?'] = 1; g_ini_chartable1[(unsigned)'/'] = 1; g_ws_chartable[(unsigned)'\n'] = 1; g_ws_chartable[(unsigned)'\v'] = 1; g_ws_chartable[(unsigned)'\r'] = 1; g_ws_chartable[(unsigned)'\t'] = 1; g_ws_chartable[(unsigned)'\f'] = 1; g_ws_chartable[(unsigned)' '] = 1; } void TextParsers::OnSourceModAllInitialized() { sharesys->AddInterface(NULL, this); } unsigned int TextParsers::GetUTF8CharBytes(const char *stream) { return _GetUTF8CharBytes(stream); } /** * File streams */ bool FileStreamReader(void *stream, char *buffer, size_t maxlength, unsigned int *read) { size_t num = fread(buffer, 1, maxlength, (FILE *)stream); *read = static_cast<unsigned int>(num); if (num == 0 && feof((FILE *)stream)) { return true; } return (ferror((FILE *)stream) == 0); } SMCError TextParsers::ParseFile_SMC(const char *file, ITextListener_SMC *smc, SMCStates *states) { FILE *fp = fopen(file, "rt"); if (!fp) { if (states != NULL) { states->line = 0; states->col = 0; } return SMCError_StreamOpen; } SMCError result = ParseStream_SMC(fp, FileStreamReader, smc, states); fclose(fp); return result; } SMCError TextParsers::ParseSMCFile(const char *file, ITextListener_SMC *smc_listener, SMCStates *states, char *buffer, size_t maxsize) { const char *errstr; FILE *fp = fopen(file, "rt"); if (fp == NULL) { char error[256] = "unknown"; if (states != NULL) { states->line = 0; states->col = 0; } libsys->GetPlatformError(error, sizeof(error)); ke::SafeSprintf(buffer, maxsize, "File could not be opened: %s", error); return SMCError_StreamOpen; } SMCError result = ParseStream_SMC(fp, FileStreamReader, smc_listener, states); fclose(fp); errstr = GetSMCErrorString(result); ke::SafeSprintf(buffer, maxsize, "%s", errstr != NULL ? errstr : "Unknown error"); return result; } struct RawStream { const char *stream; size_t length; size_t pos; }; bool RawStreamReader(void *stream, char *buffer, size_t maxlength, unsigned int *read) { RawStream *rs = (RawStream *)stream; if (rs->pos >= rs->length) { return false; } size_t remaining = rs->length - rs->pos; /* Use the smaller of the two */ size_t copy = (remaining > maxlength) ? maxlength : remaining; memcpy(buffer, &rs->stream[rs->pos], copy); rs->pos += copy; *read = copy; assert(rs->pos <= rs->length); return true; } SMCError TextParsers::ParseSMCStream(const char *stream, size_t length, ITextListener_SMC *smc_listener, SMCStates *states, char *buffer, size_t maxsize) { RawStream rs; SMCError result; rs.stream = stream; rs.length = length; rs.pos = 0; result = ParseStream_SMC(&rs, RawStreamReader, smc_listener, states); const char *errstr = GetSMCErrorString(result); ke::SafeSprintf(buffer, maxsize, "%s", errstr != NULL ? errstr : "Unknown error"); return result; } /** * Raw parsing of streams with helper functions */ struct StringInfo { StringInfo() : quoted(false), ptr(NULL), end(NULL), special(false) { } bool quoted; char *ptr; char *end; bool special; }; const char *FixupString(StringInfo &data) { if (!data.ptr) { return NULL; } if (data.quoted) { data.ptr++; } #if defined _DEBUG else { /* A string will never have beginning whitespace because we ignore it in the stream. * Furthermore, if there is trailing whitespace, the end ptr will point to it, so it is valid * to overwrite! Lastly, the last character must be whitespace or a comment/invalid character. */ } #endif /* Do some extra work on strings that have special quoted characters. */ if (data.special) { char *outptr = data.ptr; size_t len = data.end - data.ptr; if (len >= 2) { for (size_t i=0; i<len; i++) { if (data.ptr[i] == '\\' && i < len - 1) { /* Resolve the next character. */ i++; if (data.ptr[i] == 'n') { data.ptr[i] = '\n'; } else if (data.ptr[i] == 't') { data.ptr[i] = '\t'; } else if (data.ptr[i] == 'r') { data.ptr[i] = '\r'; } else if (data.ptr[i] != '\\' && data.ptr[i] != '"') { /* This character is invalid, so go back one */ i--; } } *outptr++ = data.ptr[i]; } *outptr = '\0'; } } if (data.end) { *(data.end) = '\0'; } return data.ptr; } const char *rotate(StringInfo info[3]) { if (info[2].ptr != NULL) { return info[2].ptr; } if (info[0].ptr != NULL) { info[2] = info[1]; info[1] = info[0]; info[0] = StringInfo(); } return NULL; } void scrap(StringInfo info[3]) { info[2] = StringInfo(); info[1] = StringInfo(); info[0] = StringInfo(); } void reloc(StringInfo &data, unsigned int bytes) { if (data.ptr) { data.ptr -= bytes; } if (data.end) { data.end -= bytes; } } char *lowstring(StringInfo info[3]) { for (int i=2; i>=0; i--) { if (info[i].ptr) { return info[i].ptr; } } return NULL; } SMCError TextParsers::ParseStream_SMC(void *stream, STREAMREADER srdr, ITextListener_SMC *smc, SMCStates *pStates) { char *reparse_point = NULL; char in_buf[4096]; char *parse_point = in_buf; char *line_begin = in_buf; unsigned int read; unsigned int curlevel = 0; bool in_quote = false; bool ignoring = false; bool eol_comment = false; bool ml_comment = false; unsigned int i; SMCError err = SMCError_Okay; SMCResult res; SMCStates states; char c; StringInfo strings[3]; StringInfo emptystring; states.line = 1; states.col = 0; smc->ReadSMC_ParseStart(); /** * The stream reader reads in as much as it can fill the buffer with. * It then processes the buffer. If the buffer cannot be fully processed, for example, * a line is left hanging with no newline, then the contents of the buffer is shifted * down, and the buffer is filled from the stream reader again. * * What makes this particularly annoying is that we cache pointers everywhere, so when * the shifting process takes place, all those pointers must be shifted as well. */ while (srdr(stream, parse_point, sizeof(in_buf) - (parse_point - in_buf) - 1, &read)) { if (!read) { break; } /* Check for BOM markings, which is only relevant on the first line. * Not worth it, but it could be moved out of the loop. */ if (states.line == 1 && in_buf[0] == (char)0xEF && in_buf[1] == (char)0xBB && in_buf[2] == (char)0xBF) { /* Move EVERYTHING down :\ */ memmove(in_buf, &in_buf[3], read - 3); read -= 3; } if (reparse_point) { read += (parse_point - reparse_point); parse_point = reparse_point; reparse_point = NULL; } for (i=0; i<read; i++) { c = parse_point[i]; if (c == '\n') { /* If we got a newline, there's a lot of things that could have happened in the interim. * First, let's make sure the staged strings are rotated. */ if (strings[0].ptr) { strings[0].end = &parse_point[i]; if (rotate(strings) != NULL) { err = SMCError_InvalidTokens; goto failed; } } /* Next, let's clear some line-based values that may no longer have meaning */ eol_comment = false; in_quote = false; if (ignoring && !ml_comment) { ignoring = false; } /* Pass the raw line onto the listener. We terminate the line so the receiver * doesn't get tons of useless info. We restore the newline after. */ parse_point[i] = '\0'; if ((res=smc->ReadSMC_RawLine(&states, line_begin)) != SMCResult_Continue) { err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay; goto failed; } parse_point[i] = '\n'; /* Now we check the sanity of our staged strings! */ if (strings[2].ptr) { if (!curlevel) { err = SMCError_InvalidProperty1; goto failed; } /* Assume the next string is a property and pass the info on. */ if ((res=smc->ReadSMC_KeyValue( &states, FixupString(strings[2]), FixupString(strings[1]))) != SMCResult_Continue) { err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay; goto failed; } scrap(strings); } /* Change the states for the next line */ states.col = 0; states.line++; line_begin = &parse_point[i+1]; //Note: safe because this gets relocated later } else if (ignoring) { if (in_quote) { /* If i was 0, we could have reparsed, so make sure there's no buffer underrun */ if ((&parse_point[i] != in_buf) && c == '"' && parse_point[i-1] != '\\') { /* If we reached a quote in an ignore phase, * we're staging a string and we must rotate it out. */ in_quote = false; ignoring = false; /* Set our info */ strings[0].end = &parse_point[i]; strings[0].quoted = true; if (rotate(strings) != NULL) { /* If we rotated too many strings, there was too much crap on one line */ err = SMCError_InvalidTokens; goto failed; } } else if (c == '\\') { strings[0].special = true; if (i == (read - 1)) { reparse_point = &parse_point[i]; break; } } } else if (ml_comment) { if (c == '*') { /* Check if we need to get more input first */ if (i == read - 1) { reparse_point = &parse_point[i]; break; } if (parse_point[i+1] == '/') { ml_comment = false; ignoring = false; /* We should not be staging anything right now. */ assert(strings[0].ptr == NULL); /* Advance the input stream so we don't choke on this token */ i++; states.col++; } } } } else { /* Check if we're whitespace or not */ if (!g_ws_chartable[(unsigned char)c]) { bool restage = false; /* Check various special tokens: * ; * // * / * * { * } */ if (c == ';' || c == '/') { /* If it's a line-based comment (that is, ; or //) * we will need to scrap everything until the end of the line. */ if (c == '/') { if (i == read - 1) { /* If we reached the end of the look-ahead, we need to re-check our input. * Breaking out will force this to be the new reparse point! */ reparse_point = &parse_point[i]; break; } if (parse_point[i + 1] == '/') { /* standard comment */ ignoring = true; eol_comment = true; restage = true; } else if (parse_point[i+1] == '*') { /* inline comment - start ignoring */ ignoring = true; ml_comment = true; /* yes, we restage, meaning that: * STR/ *stuff* /ING (space because ml comments don't nest in C++) * will not generate 'STRING', but rather 'STR' and 'ING'. * This should be a rare occurrence and is done here for convenience. */ restage = true; } } else { ignoring = true; eol_comment = true; restage = true; } } else if (c == '{') { /* If we are staging a string, we must rotate here */ if (strings[0].ptr) { /* We have unacceptable tokens on this line */ if (rotate(strings) != NULL) { err = SMCError_InvalidSection1; goto failed; } } /* Sections must always be alone */ if (strings[2].ptr != NULL) { err = SMCError_InvalidSection1; goto failed; } else if (strings[1].ptr == NULL) { err = SMCError_InvalidSection2; goto failed; } if ((res=smc->ReadSMC_NewSection(&states, FixupString(strings[1]))) != SMCResult_Continue) { err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay; goto failed; } strings[1] = emptystring; curlevel++; } else if (c == '}') { /* Unlike our matching friend, this can be on the same line as something prior */ if (rotate(strings) != NULL) { err = SMCError_InvalidSection3; goto failed; } if (strings[2].ptr) { if (!curlevel) { err = SMCError_InvalidProperty1; goto failed; } if ((res=smc->ReadSMC_KeyValue( &states, FixupString(strings[2]), FixupString(strings[1]))) != SMCResult_Continue) { err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay; goto failed; } } else if (strings[1].ptr) { err = SMCError_InvalidSection3; goto failed; } else if (!curlevel) { err = SMCError_InvalidSection4; goto failed; } /* Now it's safe to leave the section */ scrap(strings); if ((res=smc->ReadSMC_LeavingSection(&states)) != SMCResult_Continue) { err = (res == SMCResult_HaltFail) ? SMCError_Custom : SMCError_Okay; goto failed; } curlevel--; } else if (c == '"') { /* If we get a quote mark, we always restage, but we need to do it beforehand */ if (strings[0].ptr) { strings[0].end = &parse_point[i]; if (rotate(strings) != NULL) { err = SMCError_InvalidTokens; goto failed; } } strings[0].ptr = &parse_point[i]; in_quote = true; ignoring = true; } else if (!strings[0].ptr) { /* If we have no string, we must start one */ strings[0].ptr = &parse_point[i]; } if (restage && strings[0].ptr) { strings[0].end = &parse_point[i]; if (rotate(strings) != NULL) { err = SMCError_InvalidTokens; goto failed; } } } else { /* If we're eating a string and get whitespace, we need to restage. * (Note that if we are quoted, this is being ignored) */ if (strings[0].ptr) { /* * The specification says the second string in a pair does not need to be quoted. * Thus, we check if there's already a string on the stack. * If there's a newline, we always rotate so the newline has an empty starter. */ if (!strings[1].ptr) { /* There's no string, so we must move this one down and eat up another */ strings[0].end = &parse_point[i]; rotate(strings); } else if (!strings[1].quoted) { err = SMCError_InvalidTokens; goto failed; } } } } /* Advance which token we're on */ states.col++; } if (line_begin != in_buf) { /* The line buffer has advanced, so it's safe to copy N bytes back to the beginning. * What's N? N is the lowest point we're currently relying on. */ char *stage = lowstring(strings); if (!stage || stage > line_begin) { stage = line_begin; } unsigned int bytes = read - (stage - parse_point); /* It is now safe to delete everything before the staged point */ memmove(in_buf, stage, bytes); /* Calculate the number of bytes in the new buffer */ bytes = stage - in_buf; /* Relocate all the cached pointers to our new base */ line_begin -= bytes; reloc(strings[0], bytes); reloc(strings[1], bytes); reloc(strings[2], bytes); if (reparse_point) { reparse_point -= bytes; } if (parse_point) { parse_point = &parse_point[read]; parse_point -= bytes; } } else if (read == sizeof(in_buf) - 1) { err = SMCError_TokenOverflow; goto failed; } } /* If we're done parsing and there are tokens left over... */ if (curlevel) { err = SMCError_InvalidSection5; goto failed; } else if (strings[0].ptr || strings[1].ptr) { err = SMCError_InvalidTokens; goto failed; } smc->ReadSMC_ParseEnd(false, false); if (pStates != NULL) { *pStates = states; } return SMCError_Okay; failed: if (pStates != NULL) { *pStates = states; } smc->ReadSMC_ParseEnd(true, (err == SMCError_Custom)); return err; } /** * INI parser */ bool TextParsers::ParseFile_INI(const char *file, ITextListener_INI *ini_listener, unsigned int *line, unsigned int *col) { FILE *fp = fopen(file, "rt"); unsigned int curline = 0; unsigned int curtok; size_t len; if (!fp) { if (line) { *line = 0; } return false; } char buffer[2048]; char *ptr, *save_ptr; bool in_quote; while (!feof(fp)) { curline++; curtok = 0; buffer[0] = '\0'; if (fgets(buffer, sizeof(buffer), fp) == NULL) { break; } //:TODO: this will only run once, so find a nice way to move it out of the while loop /* If this is the first line, check the first three bytes for BOM */ if (curline == 1 && buffer[0] == (char)0xEF && buffer[1] == (char)0xBB && buffer[2] == (char)0xBF) { /* We have a UTF-8 marked file... skip these bytes */ ptr = &buffer[3]; } else { ptr = buffer; } /*************************************************** * We preprocess the string before parsing tokens! * ***************************************************/ /* First strip beginning whitespace */ while (*ptr != '\0' && g_ws_chartable[(unsigned char)*ptr] != 0) { ptr++; } len = strlen(ptr); if (!len) { continue; } /* Now search for comment characters */ in_quote = false; save_ptr = ptr; for (size_t i=0; i<len; i++,ptr++) { if (!in_quote) { switch (*ptr) { case '"': { in_quote = true; break; } case ';': { /* Stop the loop */ len = i; /* Terminate the string here */ *ptr = '\0'; break; } } } else { if (*ptr == '"') { in_quote = false; } } } if (!len) { continue; } ptr = save_ptr; /* Lastly, strip ending whitespace off */ for (size_t i=len-1; i<len; i--) { if (g_ws_chartable[(unsigned char)ptr[i]]) { ptr[i] = '\0'; len--; } else { break; } } if (!len) { continue; } if (!ini_listener->ReadINI_RawLine(ptr, &curtok)) { goto event_failed; } if (*ptr == '[') { bool invalid_tokens = false; bool got_bracket = false; bool extra_tokens = false; char c; bool alnum; wchar_t wc; for (size_t i=1; i<len; i++) { c = ptr[i]; alnum = false; if (c & (1<<7)) { if (mbtowc(&wc, &ptr[i], len-i) != -1) { alnum = (iswalnum(wc) != 0); i += _GetUTF8CharBytes(&ptr[i]) - 1; } } else { alnum = (isalnum(c) != 0) || (g_ini_chartable1[(unsigned char)c] != 0); } if (!alnum) { /* First check - is this a bracket? */ if (c == ']') { /* Yes! */ got_bracket = true; /* If this isn't the last character... */ if (i != len - 1) { extra_tokens = true; } /* terminate */ ptr[i] = '\0'; break; } else { /* n...No! Continue copying. */ invalid_tokens = true; } } } /* Tell the handler */ if (!ini_listener->ReadINI_NewSection(&ptr[1], invalid_tokens, got_bracket, extra_tokens, &curtok)) { goto event_failed; } } else { char *key_ptr = ptr; char *val_ptr = NULL; char c; size_t first_space = 0; bool invalid_tokens = false; bool equal_token = false; bool quotes = false; bool alnum; wchar_t wc; for (size_t i=0; i<len; i++) { c = ptr[i]; alnum = false; /* is this an invalid char? */ if (c & (1<<7)) { if (mbtowc(&wc, &ptr[i], len-i) != -1) { alnum = (iswalnum(wc) != 0); i += _GetUTF8CharBytes(&ptr[i]) - 1; } } else { alnum = (isalnum(c) != 0) || (g_ini_chartable1[(unsigned char)c] != 0); } if (!alnum) { if (g_ws_chartable[(unsigned char)c]) { /* if it's a space, keep track of the first occurring space */ if (!first_space) { first_space = i; } } else { if (c == '=') { /* if it's an equal sign, we're done with the key */ if (first_space) { /* remove excess whitespace */ key_ptr[first_space] = '\0'; } else { /* remove the equal sign */ key_ptr[i] = '\0'; } if (ptr[++i] != '\0') { /* If this isn't the end, set next pointer */ val_ptr = &ptr[i]; } equal_token = true; break; } else { /* Mark that we got something invalid! */ invalid_tokens = true; first_space = 0; } } } } /* Now we need to parse the value, if any */ if (val_ptr) { /* eat up spaces! there shouldn't be any h*/ while ((*val_ptr != '\0') && g_ws_chartable[(unsigned char)*val_ptr] != 0) { val_ptr++; } if (*val_ptr == '\0') { val_ptr = NULL; goto skip_value; } /* Do we have an initial quote? If so, the parsing rules change! */ if (*val_ptr == '"' && *val_ptr != '\0') { len = strlen(val_ptr); if (val_ptr[len-1] == '"') { /* Strip quotes! */ val_ptr[--len] = '\0'; val_ptr++; quotes = true; } } } skip_value: /* We're done! */ curtok = val_ptr - buffer; if (!ini_listener->ReadINI_KeyValue(key_ptr, val_ptr, invalid_tokens, equal_token, quotes, &curtok)) { curtok = 0; goto event_failed; } } } if (line) { *line = curline; } fclose(fp); return true; event_failed: if (line) { *line = curline; } if (col) { *col = curtok; } fclose(fp); return false; } const char *TextParsers::GetSMCErrorString(SMCError err) { static const char *s_errors[] = { NULL, "Stream failed to open", "Stream returned read error", "Callback error", "Un-quoted section has invalid tokens", "Section declared without header", "Section declared with unknown tokens", "Section ending without a matching section beginning", "Section beginning without a matching ending", "Line contained too many invalid tokens", "Token buffer overflowed", "A property was declared outside of a section", }; if (err < SMCError_Okay || err > SMCError_InvalidProperty1) { return NULL; } return s_errors[err]; }