Backported the changes CompuPhase made to the compiler to support string literal concatenation, including all fixes in later commits from r30 on. http://code.google.com/p/pawnscript/source/detail?r=30 Pawn uses an ellipsis "..." to concatenate, so it looks like this: #define PROJECT_AUTHOR "Greyscale" #define PROJECT_COPYRIGHT "Copyright (C) 2010 " ... PROJECT_AUTHOR This would result in PROJECT_COPYRIGHT being defined as "Copyright (C) 2010 Greyscale". While I was at it, the feature for stringizing a macro parameter was ported too. From the changelog for version 3.3.4026 (http://www.compuphase.com/pawn/pawnhistory.htm): The macro substitution processor now recognizes the "#" character for "stringizing" a parameter. For example, if you have the definition #define log(%1) #%1 Then the expression log(test) will result in "test". Note that concatenation of literal strings requires an ellipsis in pawn (which is different than C/C++). So to combine the parameter with literal strings, use a syntax like: #define log(%1) "logging: " ... #%1 ... "\n" The stringize operator is only available in the replacement text of a macro. Doing PrintToServer(log(hello)); would print logging: hello\n
		
			
				
	
	
		
			430 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			430 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*  Codepage translation to Unicode, and UTF-8 support
 | 
						|
 *
 | 
						|
 *  The translation is based on codepage mapping files that are distributed
 | 
						|
 *  by the Unicode consortium, see ftp://ftp.unicode.org/Public/MAPPINGS/.
 | 
						|
 *
 | 
						|
 *  Character sets with a maximum of 256 codes are translated via a lookup
 | 
						|
 *  table (these are Single-Byte Character Sets). Character sets like Shift-JIS
 | 
						|
 *  with single-byte characters and multi-byte characters (introduced by a
 | 
						|
 *  leader byte) are split into two tables: the 256-entry lookup table for
 | 
						|
 *  the single-byte characters and an extended table for the multi-byte
 | 
						|
 *  characters. The extended table is allocated dynamically; the lookup table
 | 
						|
 *  is allocated statically, so loading SBCS tables cannot fail (if the tables
 | 
						|
 *  themselves are valid, of course).
 | 
						|
 *
 | 
						|
 *  Copyright (c) ITB CompuPhase, 2004-2006
 | 
						|
 *
 | 
						|
 *  This software is provided "as-is", without any express or implied warranty.
 | 
						|
 *  In no event will the authors be held liable for any damages arising from
 | 
						|
 *  the use of this software.
 | 
						|
 *
 | 
						|
 *  Permission is granted to anyone to use this software for any purpose,
 | 
						|
 *  including commercial applications, and to alter it and redistribute it
 | 
						|
 *  freely, subject to the following restrictions:
 | 
						|
 *
 | 
						|
 *  1.  The origin of this software must not be misrepresented; you must not
 | 
						|
 *      claim that you wrote the original software. If you use this software in
 | 
						|
 *      a product, an acknowledgment in the product documentation would be
 | 
						|
 *      appreciated but is not required.
 | 
						|
 *  2.  Altered source versions must be plainly marked as such, and must not be
 | 
						|
 *      misrepresented as being the original software.
 | 
						|
 *  3.  This notice may not be removed or altered from any source distribution.
 | 
						|
 *
 | 
						|
 *  Version: $Id$
 | 
						|
 */
 | 
						|
#include <assert.h>
 | 
						|
#include <stdio.h>
 | 
						|
#include <stddef.h>
 | 
						|
#include <stdlib.h>
 | 
						|
#include <string.h>
 | 
						|
#include "sc.h"
 | 
						|
 | 
						|
/* Boolean constants, for when no earlier header has defined them. */
#if !defined TRUE
  #define FALSE         0
  #define TRUE          1
#endif

/* Fallback size for path buffers when the platform does not supply one. */
#if !defined _MAX_PATH
  #define _MAX_PATH     250
#endif

/* Directory separator character; selected from the platform macros unless it
 * was already defined.
 */
#if !defined DIRSEP_CHAR
  #if defined LINUX || defined __FreeBSD__ || defined __OpenBSD__
    #define DIRSEP_CHAR '/'
  #elif defined macintosh
    #define DIRSEP_CHAR ':'
  #else
    #define DIRSEP_CHAR '\\'
  #endif
#endif

/* Number of elements in an array. The argument must be a real array, not a
 * pointer. The parameter is parenthesized in the element expression so that
 * the macro stays well-formed for any argument expression.
 */
#if !defined ELEMENTS
  #define ELEMENTS(array)       (sizeof(array) / sizeof((array)[0]))
#endif
 | 
						|
 | 
						|
#if !defined NO_CODEPAGE
 | 
						|
 | 
						|
#if !defined MAXCODEPAGE
 | 
						|
  #define MAXCODEPAGE   12      /* typically "cp" + 4 digits + ".txt" */
 | 
						|
#endif
 | 
						|
#define INVALID         0xffffu /* 0xffff and 0xfffe are invalid Unicode characters */
 | 
						|
#define LEADBYTE        0xfffeu
 | 
						|
 | 
						|
struct wordpair {
 | 
						|
  unsigned short index;
 | 
						|
  wchar_t code;
 | 
						|
};
 | 
						|
static char cprootpath[_MAX_PATH] = { DIRSEP_CHAR, '\0' };
 | 
						|
static wchar_t bytetable[256];
 | 
						|
static struct wordpair *wordtable = NULL;
 | 
						|
static unsigned wordtablesize = 0;
 | 
						|
static unsigned wordtabletop = 0;
 | 
						|
 | 
						|
 | 
						|
/* cp_readline() reads one non-empty line from "fp" into "string".
 *
 * A line is delimited by '\r' or '\n'; the delimiter is NOT stored in the
 * string and empty lines are skipped. At most size-1 characters are stored
 * and the string is always zero-terminated. A line that is longer than the
 * buffer is returned in pieces over successive calls.
 *
 *   fp      the file to read from
 *   string  the buffer that receives the line
 *   size    the size of the buffer in characters; must be at least 2
 *
 * Returns 1 when at least one character was stored and 0 otherwise (end of
 * file, or a read error).
 */
static int cp_readline(FILE *fp,char *string,size_t size)
{
  size_t count=0;
  int c;

  assert(size>1);
  /* Check for room in the buffer BEFORE fetching the next character; with the
   * tests in the opposite order, the character that was fetched when the
   * buffer was already full would be dropped.
   */
  while (count<size-1 && (c=fgetc(fp))!=EOF) {
    if (c=='\r' || c=='\n') {
      if (count>0)  /* '\r' or '\n' ends a string */
        break;
      /* if count==0, the line started with a '\r' or '\n', or perhaps line
       * ends in the file are '\r\n' and we read and stopped on the '\r' of
       * the preceding line
       */
    } else {
      string[count++]=(char)c;
    } /* if */
  } /* while */
  string[count]='\0';
  return count>0;
}
 | 
						|
 | 
						|
/* cp_path() sets the directory where all codepage files must be found (if
 | 
						|
 * the parameter to cp_set() specifies a full path, that is used instead).
 | 
						|
 * The path is specified into two parts: root and directory; the full path
 | 
						|
 * for the codepage direcory is just the concatenation of the two, with a
 | 
						|
 * directory separator in between. The directory is given in two parts,
 | 
						|
 * because often a program already retrieves its "home" directory and the
 | 
						|
 * codepages are most conveniently stored in a subdirectory of this home
 | 
						|
 * directory.
 | 
						|
 */
 | 
						|
SC_FUNC int cp_path(const char *root, const char *directory)
 | 
						|
{
 | 
						|
  size_t len1,len2;
 | 
						|
  int add_slash1,add_slash2;
 | 
						|
 | 
						|
  len1= (root!=NULL) ? strlen(root) : 0;
 | 
						|
  add_slash1= (len1==0 || root[len1-1]!=DIRSEP_CHAR);
 | 
						|
  len2= (directory!=NULL) ? strlen(directory) : 0;
 | 
						|
  add_slash2= (len2>0 && root[len2-1]!=DIRSEP_CHAR);
 | 
						|
  if (len1+add_slash1+len2+add_slash2>=(_MAX_PATH-MAXCODEPAGE))
 | 
						|
    return FALSE;       /* full filename may not fit */
 | 
						|
  if (root!=NULL)
 | 
						|
    strcpy(cprootpath,root);
 | 
						|
  if (add_slash1) {
 | 
						|
    assert(len1==0 || cprootpath[len1]=='\0');
 | 
						|
    cprootpath[len1]=DIRSEP_CHAR;
 | 
						|
    cprootpath[len1+1]='\0';
 | 
						|
  } /* if */
 | 
						|
  if (directory!=NULL)
 | 
						|
    strcat(cprootpath,directory);
 | 
						|
  if (add_slash2) {
 | 
						|
    assert(cprootpath[len1+add_slash1+len2]=='\0');
 | 
						|
    cprootpath[len1+add_slash1+len2]=DIRSEP_CHAR;
 | 
						|
    cprootpath[len1+add_slash1+len2+1]='\0';
 | 
						|
  } /* if */
 | 
						|
  cp_set(NULL);         /* start with a "linear" table (no translation) */
 | 
						|
  return TRUE;
 | 
						|
}
 | 
						|
 | 
						|
/* cp_set() loads a codepage from a file. The name parameter may be a
 | 
						|
 * filename (including a full path) or it may be a partial codepage name.
 | 
						|
 * If the name parameter is NULL, the codepage is cleared to be a "linear"
 | 
						|
 * table (no translation).
 | 
						|
 * The following files are attempted to open (where <name> specifies the
 | 
						|
 * value of the parameter):
 | 
						|
 *    <name>
 | 
						|
 *    <cprootpath>/<name>
 | 
						|
 *    <cprootpath>/<name>.txt
 | 
						|
 *    <cprootpath>/cp<name>
 | 
						|
 *    <cprootpath>/cp<name>.txt
 | 
						|
 */
 | 
						|
SC_FUNC int cp_set(const char *name)
 | 
						|
{
 | 
						|
  char filename[_MAX_PATH];
 | 
						|
  FILE *fp=NULL;
 | 
						|
  unsigned index;
 | 
						|
 | 
						|
  /* for name==NULL, set up an identity table */
 | 
						|
  if (name==NULL || *name=='\0') {
 | 
						|
    if (wordtable!=NULL) {
 | 
						|
      free(wordtable);
 | 
						|
      wordtable=NULL;
 | 
						|
      wordtablesize=0;
 | 
						|
      wordtabletop=0;
 | 
						|
    } /* if */
 | 
						|
    for (index=0; index<ELEMENTS(bytetable); index++)
 | 
						|
      bytetable[index]=(wchar_t)index;
 | 
						|
    return TRUE;
 | 
						|
  } /* if */
 | 
						|
 | 
						|
  /* try to open the file as-is */
 | 
						|
  if (strchr(name,DIRSEP_CHAR)!=NULL)
 | 
						|
    fp=fopen(name,"rt");
 | 
						|
  if (fp==NULL) {
 | 
						|
    /* try opening the file in the "root path" for codepages */
 | 
						|
    if (strlen(name)>MAXCODEPAGE)
 | 
						|
      return 0;
 | 
						|
    assert(strlen(name)+strlen(cprootpath)<_MAX_PATH);
 | 
						|
    strcpy(filename,cprootpath);
 | 
						|
    strcat(filename,name);
 | 
						|
    fp=fopen(filename,"rt");
 | 
						|
  } /* if */
 | 
						|
  if (fp==NULL) {
 | 
						|
    /* try opening the file in the "root path" for codepages, with a ".txt" extension */
 | 
						|
    if (strlen(name)+4>=MAXCODEPAGE)
 | 
						|
      return 0;
 | 
						|
    assert(strlen(filename)+4<_MAX_PATH);
 | 
						|
    strcat(filename,".txt");
 | 
						|
    fp=fopen(filename,"rt");
 | 
						|
  } /* if */
 | 
						|
  if (fp==NULL) {
 | 
						|
    /* try opening the file in the "root path" for codepages, with "cp" prefixed before the name */
 | 
						|
    if (strlen(name)+2>MAXCODEPAGE)
 | 
						|
      return 0;
 | 
						|
    assert(2+strlen(name)+strlen(cprootpath)<_MAX_PATH);
 | 
						|
    strcpy(filename,cprootpath);
 | 
						|
    strcat(filename,"cp");
 | 
						|
    strcat(filename,name);
 | 
						|
    fp=fopen(filename,"rt");
 | 
						|
  } /* if */
 | 
						|
  if (fp==NULL) {
 | 
						|
    /* try opening the file in the "root path" for codepages, with "cp" prefixed an ".txt" appended */
 | 
						|
    if (strlen(name)+2+4>MAXCODEPAGE)
 | 
						|
      return 0;
 | 
						|
    assert(strlen(filename)+4<_MAX_PATH);
 | 
						|
    strcat(filename,".txt");
 | 
						|
    fp=fopen(filename,"rt");
 | 
						|
  } /* if */
 | 
						|
  if (fp==NULL)
 | 
						|
    return FALSE;       /* all failed */
 | 
						|
 | 
						|
  /* clear the tables */
 | 
						|
  for (index=0; index<ELEMENTS(bytetable); index++)
 | 
						|
    bytetable[index]=INVALID;   /* special code meaning "not found" */
 | 
						|
  assert((wordtablesize==0 && wordtabletop==0 && wordtable==NULL)
 | 
						|
         || (wordtablesize>0 && wordtable!=NULL));
 | 
						|
  if (wordtable!=NULL) {
 | 
						|
    free(wordtable);
 | 
						|
    wordtable=NULL;
 | 
						|
    wordtablesize=0;
 | 
						|
    wordtabletop=0;
 | 
						|
  } /* if */
 | 
						|
 | 
						|
  /* read in the table */
 | 
						|
  while (cp_readline(fp,filename,sizeof filename)) {
 | 
						|
    char *ptr;
 | 
						|
    if ((ptr=strchr(filename,'#'))!=NULL)
 | 
						|
      *ptr='\0';                /* strip of comment */
 | 
						|
    for (ptr=filename; *ptr>0 && *ptr<' '; ptr++)
 | 
						|
      /* nothing */;            /* skip leading whitespace */
 | 
						|
    if (*ptr!='\0') {
 | 
						|
      /* content on line */
 | 
						|
      unsigned code=LEADBYTE;
 | 
						|
      int num=sscanf(ptr,"%i %i",&index,&code);
 | 
						|
      /* if sscanf() returns 1 and the index is in range 0..255, then the
 | 
						|
       * code is a DBCS lead byte; if sscanf() returns 2 and index>=256, this
 | 
						|
       * is a double byte pair (lead byte + follower)
 | 
						|
       */
 | 
						|
      if (num>=1 && index<256) {
 | 
						|
        bytetable[index]=(wchar_t)code;
 | 
						|
      } else if (num==2 && index>=256 && index<LEADBYTE) {
 | 
						|
        /* store the DBCS character in wordtable */
 | 
						|
        if (wordtabletop>=wordtablesize) {
 | 
						|
          /* grow the list */
 | 
						|
          int newsize;
 | 
						|
          struct wordpair *newblock;
 | 
						|
          newsize= (wordtablesize==0) ? 128 : 2*wordtablesize;
 | 
						|
          newblock=(struct wordpair *)malloc(newsize*sizeof(*wordtable));
 | 
						|
          if (newblock!=NULL) {
 | 
						|
            memcpy(newblock,wordtable,wordtabletop*sizeof(*wordtable));
 | 
						|
            free(wordtable);
 | 
						|
            wordtable=newblock;
 | 
						|
            wordtablesize=newsize;
 | 
						|
          } /* if */
 | 
						|
        } /* if */
 | 
						|
        if (wordtabletop<wordtablesize) {
 | 
						|
          /* insert at sorted position */
 | 
						|
          int pos=wordtabletop;
 | 
						|
          assert(wordtable!=NULL);
 | 
						|
          while (pos>0 && (unsigned)wordtable[pos-1].index>index) {
 | 
						|
            wordtable[pos]=wordtable[pos-1];
 | 
						|
            pos--;
 | 
						|
          } /* while */
 | 
						|
          wordtable[pos].index=(unsigned short)index;
 | 
						|
          wordtable[pos].code=(wchar_t)code;
 | 
						|
        } /* if */
 | 
						|
      } /* if */
 | 
						|
    } /* if */
 | 
						|
  } /* while */
 | 
						|
 | 
						|
  fclose(fp);
 | 
						|
  return TRUE;
 | 
						|
}
 | 
						|
 | 
						|
/* cp_translate() translates the character at the start of "string" through
 * the tables that cp_set() has set up.
 *
 *   string  the zero-terminated text to read from
 *   endptr  if not NULL, receives the position just behind the byte(s) that
 *           were consumed (one byte, or two for a double-byte character)
 *
 * The first byte is looked up in "bytetable". If that marks the byte as a
 * lead byte and a double-byte table is loaded, the lead byte and the byte
 * that follows it are combined into a 16-bit index, which is looked up in
 * "wordtable". If the pair is not in the table, the LEADBYTE marker itself
 * is returned.
 *
 * A lead byte that is directly followed by the string terminator is an
 * incomplete character: the terminator is not consumed (so that the caller
 * cannot run past the end of the string) and the LEADBYTE marker is
 * returned.
 */
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr)
{
  wchar_t result;

  result=bytetable[*string++];
  /* check whether this is a leader code (with a follower byte behind it) */
  if ((unsigned)result==LEADBYTE && wordtable!=NULL && *string!='\0') {
    /* look up the code via binary search */
    int low,high,mid;
    unsigned short index=(unsigned short)(((*(string-1)) << 8) | *string);
    string++;
    assert(wordtabletop>0);
    low=0;
    high=wordtabletop-1;
    while (low<high) {
      mid=(low+high)/2;
      assert(low<=mid && mid<high);
      if (index>wordtable[mid].index)
        low=mid+1;
      else
        high=mid;
    } /* while */
    assert(low==high);
    if (wordtable[low].index==index)
      result=wordtable[low].code;
  } /* if */

  if (endptr!=NULL)
    *endptr=string;
  return (cell)result;
}
 | 
						|
 | 
						|
#endif  /* NO_CODEPAGE */
 | 
						|
 | 
						|
#if !defined NO_UTF8
 | 
						|
/* get_utf8_char() decodes a single UTF-8 encoded character.
 *
 *   string  the text to decode; reading stops at the first byte that cannot
 *           be part of the sequence (which includes a zero terminator in the
 *           middle of a sequence)
 *   endptr  if not NULL, receives the position just behind the decoded
 *           character on success, or the start of the string on failure
 *
 * Returns the decoded character code, or -1 for invalid UTF-8. Sequences of
 * 1 to 6 bytes are accepted. Rejected are: a follower byte without a leader,
 * a leader with too few follower bytes, "overlong" encodings (a code stored
 * in more bytes than needed), the range 0xd800..0xdfff and the codes 0xfffe
 * and 0xffff.
 */
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
  int follow=0;
  long lowmark=0;
  unsigned char ch;
  cell result=0;

  if (endptr!=NULL)
    *endptr=string;     /* preset for the failure returns below */

  for ( ;; ) {
    ch=*string++;

    if (follow>0 && (ch & 0xc0)==0x80) {
      /* leader code is active, combine with earlier code */
      result=(result << 6) | (ch & 0x3f);
      if (--follow==0) {
        /* encoding a character in more bytes than is strictly needed,
         * is not really valid UTF-8; we are strict here to increase
         * the chance of heuristic detection of non-UTF-8 text
         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
         */
        if (result<lowmark)
          return -1;
        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
         * exist in UCS-4 (and hence, they do not exist in Unicode)
         */
        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
          return -1;
        break;          /* the sequence is complete */
      } /* if */
      /* otherwise more follower bytes are due: keep looping (leaving the
       * loop here would return a partial code for every sequence of three
       * or more bytes)
       */
    } else if (follow==0 && (ch & 0x80)==0x80) {
      /* UTF-8 leader code */
      if ((ch & 0xe0)==0xc0) {
        /* 110xxxxx 10xxxxxx */
        follow=1;
        lowmark=0x80L;
        result=ch & 0x1f;
      } else if ((ch & 0xf0)==0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
        follow=2;
        lowmark=0x800L;
        result=ch & 0x0f;
      } else if ((ch & 0xf8)==0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=3;
        lowmark=0x10000L;
        result=ch & 0x07;
      } else if ((ch & 0xfc)==0xf8) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=4;
        lowmark=0x200000L;
        result=ch & 0x03;
      } else if ((ch & 0xfe)==0xfc) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
        follow=5;
        lowmark=0x4000000L;
        result=ch & 0x01;
      } else {
        /* this is invalid UTF-8 */
        return -1;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x00) {
      /* 0xxxxxxx (US-ASCII) */
      result=ch;
      break;
    } else {
      /* this is invalid UTF-8 */
      return -1;
    } /* if */

  } /* for */

  if (endptr!=NULL)
    *endptr=string;
  return result;
}
 | 
						|
#endif
 | 
						|
 | 
						|
SC_FUNC int scan_utf8(FILE *fp,const char *filename)
 | 
						|
{
 | 
						|
  #if defined NO_UTF8
 | 
						|
    return 0;
 | 
						|
  #else
 | 
						|
    static void *resetpos=NULL;
 | 
						|
    int utf8=TRUE;
 | 
						|
    int firstchar=TRUE,bom_found=FALSE;
 | 
						|
    const unsigned char *ptr;
 | 
						|
 | 
						|
    resetpos=pc_getpossrc(fp,resetpos);
 | 
						|
    while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {
 | 
						|
      ptr=pline;
 | 
						|
      if (firstchar) {
 | 
						|
        /* check whether the very first character on the very first line
 | 
						|
         * starts with a BYTE order mark
 | 
						|
         */
 | 
						|
        cell c=get_utf8_char(ptr,&ptr);
 | 
						|
        bom_found= (c==0xfeff);
 | 
						|
        utf8= (c>=0);
 | 
						|
        firstchar=FALSE;
 | 
						|
      } /* if */
 | 
						|
      while (utf8 && *ptr!='\0')
 | 
						|
        utf8= (get_utf8_char(ptr,&ptr)>=0);
 | 
						|
    } /* while */
 | 
						|
    pc_resetsrc(fp,resetpos);
 | 
						|
    if (bom_found) {
 | 
						|
      unsigned char bom[3];
 | 
						|
      if (!utf8)
 | 
						|
        error(77,filename);     /* malformed UTF-8 encoding */
 | 
						|
      pc_readsrc(fp,bom,3);
 | 
						|
      assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
 | 
						|
    } /* if */
 | 
						|
    return utf8;
 | 
						|
  #endif  /* NO_UTF8 */
 | 
						|
}
 |