sourcemod/sourcepawn/compiler/sci18n.cpp

/*  Codepage translation to Unicode, and UTF-8 support
 *
 *  The translation is based on codepage mapping files that are distributed
 *  by the Unicode consortium, see ftp://ftp.unicode.org/Public/MAPPINGS/.
 *
 *  Character sets with a maximum of 256 codes are translated via a lookup
 *  table (these are Single-Byte Character Sets). Character sets like Shift-JIS
 *  with single-byte characters and multi-byte characters (introduced by a
 *  leader byte) are split into two tables: the 256-entry lookup table for
 *  the single-byte characters and an extended table for the multi-byte
 *  characters. The extended table is allocated dynamically; the lookup table
 *  is allocated statically, so loading SBCS tables cannot fail (if the tables
 *  themselves are valid, of course).
 *
 *  Copyright (c) ITB CompuPhase, 2004-2006
 *
 *  This software is provided "as-is", without any express or implied warranty.
 *  In no event will the authors be held liable for any damages arising from
 *  the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1.  The origin of this software must not be misrepresented; you must not
 *      claim that you wrote the original software. If you use this software in
 *      a product, an acknowledgment in the product documentation would be
 *      appreciated but is not required.
 *  2.  Altered source versions must be plainly marked as such, and must not be
 *      misrepresented as being the original software.
 *  3.  This notice may not be removed or altered from any source distribution.
 *
 *  Version: $Id$
 */
#include <assert.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "sc.h"

#if !defined TRUE
  #define FALSE         0
  #define TRUE          1
#endif
#if !defined _MAX_PATH
  #define _MAX_PATH     250
#endif
#if !defined DIRSEP_CHAR
  #if defined LINUX || defined __FreeBSD__ || defined __OpenBSD__
    #define DIRSEP_CHAR '/'
  #elif defined macintosh
    #define DIRSEP_CHAR ':'
  #else
    #define DIRSEP_CHAR '\\'
  #endif
#endif

#if !defined ELEMENTS
  #define ELEMENTS(array)       (sizeof(array) / sizeof(array[0]))
#endif

#if !defined NO_CODEPAGE

#if !defined MAXCODEPAGE
  #define MAXCODEPAGE   12      /* typically "cp" + 4 digits + ".txt" */
#endif
#define INVALID         0xffffu /* 0xffff and 0xfffe are invalid Unicode characters */
#define LEADBYTE        0xfffeu

struct wordpair {
  unsigned short index;
  wchar_t code;
};
static char cprootpath[_MAX_PATH] = { DIRSEP_CHAR, '\0' };
static wchar_t bytetable[256];
static struct wordpair *wordtable = NULL;
static unsigned wordtablesize = 0;
static unsigned wordtabletop = 0;


/* read in a line delimited by '\r' or '\n'; do NOT store the '\r' or '\n' into
 * the string and ignore empty lines
 * returns 1 for success and 0 for failure
 */
static int cp_readline(FILE *fp,char *string,size_t size)
{
  size_t count=0;
  int c;
  assert(size>1);
  while ((c=fgetc(fp))!=EOF && count<size-1) {
    if (c=='\r' || c=='\n') {
      if (count>0)  /* '\r' or '\n' ends a string */
        break;
      /* if count==0, the line started with a '\r' or '\n', or perhaps line
       * ends in the file are '\r\n' and we read and stopped on the '\r' of
       * the preceding line
       */
    } else {
      string[count++]=(char)c;
    } /* if */
  } /* while */
  string[count]='\0';
  return count>0;
}

/* cp_path() sets the directory where all codepage files must be found (if
 * the parameter to cp_set() specifies a full path, that is used instead).
 * The path is specified into two parts: root and directory; the full path
 * for the codepage direcory is just the concatenation of the two, with a
 * directory separator in between. The directory is given in two parts,
 * because often a program already retrieves its "home" directory and the
 * codepages are most conveniently stored in a subdirectory of this home
 * directory.
 */
int cp_path(const char *root, const char *directory)
{
  size_t len1,len2;
  int add_slash1,add_slash2;

  len1= (root!=NULL) ? strlen(root) : 0;
  add_slash1= (len1==0 || root[len1-1]!=DIRSEP_CHAR);
  len2= (directory!=NULL) ? strlen(directory) : 0;
  add_slash2= (len2>0 && root[len2-1]!=DIRSEP_CHAR);
  if (len1+add_slash1+len2+add_slash2>=(_MAX_PATH-MAXCODEPAGE))
    return FALSE;       /* full filename may not fit */
  if (root!=NULL)
    strcpy(cprootpath,root);
  if (add_slash1) {
    assert(len1==0 || cprootpath[len1]=='\0');
    cprootpath[len1]=DIRSEP_CHAR;
    cprootpath[len1+1]='\0';
  } /* if */
  if (directory!=NULL)
    strcat(cprootpath,directory);
  if (add_slash2) {
    assert(cprootpath[len1+add_slash1+len2]=='\0');
    cprootpath[len1+add_slash1+len2]=DIRSEP_CHAR;
    cprootpath[len1+add_slash1+len2+1]='\0';
  } /* if */
  cp_set(NULL);         /* start with a "linear" table (no translation) */
  return TRUE;
}

/* cp_set() loads a codepage from a file. The name parameter may be a
 * filename (including a full path) or it may be a partial codepage name.
 * If the name parameter is NULL, the codepage is cleared to be a "linear"
 * table (no translation).
 * The following files are attempted to open (where <name> specifies the
 * value of the parameter):
 *    <name>
 *    <cprootpath>/<name>
 *    <cprootpath>/<name>.txt
 *    <cprootpath>/cp<name>
 *    <cprootpath>/cp<name>.txt
 */
int cp_set(const char *name)
{
  char filename[_MAX_PATH];
  FILE *fp=NULL;
  unsigned index;

  /* for name==NULL, set up an identity table */
  if (name==NULL || *name=='\0') {
    if (wordtable!=NULL) {
      free(wordtable);
      wordtable=NULL;
      wordtablesize=0;
      wordtabletop=0;
    } /* if */
    for (index=0; index<ELEMENTS(bytetable); index++)
      bytetable[index]=(wchar_t)index;
    return TRUE;
  } /* if */

  /* try to open the file as-is */
  if (strchr(name,DIRSEP_CHAR)!=NULL)
    fp=fopen(name,"rt");
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages */
    if (strlen(name)>MAXCODEPAGE)
      return 0;
    assert(strlen(name)+strlen(cprootpath)<_MAX_PATH);
    strcpy(filename,cprootpath);
    strcat(filename,name);
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with a ".txt" extension */
    if (strlen(name)+4>=MAXCODEPAGE)
      return 0;
    assert(strlen(filename)+4<_MAX_PATH);
    strcat(filename,".txt");
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with "cp" prefixed before the name */
    if (strlen(name)+2>MAXCODEPAGE)
      return 0;
    assert(2+strlen(name)+strlen(cprootpath)<_MAX_PATH);
    strcpy(filename,cprootpath);
    strcat(filename,"cp");
    strcat(filename,name);
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with "cp" prefixed an ".txt" appended */
    if (strlen(name)+2+4>MAXCODEPAGE)
      return 0;
    assert(strlen(filename)+4<_MAX_PATH);
    strcat(filename,".txt");
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL)
    return FALSE;       /* all failed */

  /* clear the tables */
  for (index=0; index<ELEMENTS(bytetable); index++)
    bytetable[index]=INVALID;   /* special code meaning "not found" */
  assert((wordtablesize==0 && wordtabletop==0 && wordtable==NULL)
         || (wordtablesize>0 && wordtable!=NULL));
  if (wordtable!=NULL) {
    free(wordtable);
    wordtable=NULL;
    wordtablesize=0;
    wordtabletop=0;
  } /* if */

  /* read in the table */
  while (cp_readline(fp,filename,sizeof filename)) {
    char *ptr;
    if ((ptr=strchr(filename,'#'))!=NULL)
      *ptr='\0';                /* strip of comment */
    for (ptr=filename; *ptr>0 && *ptr<' '; ptr++)
      /* nothing */;            /* skip leading whitespace */
    if (*ptr!='\0') {
      /* content on line */
      unsigned code=LEADBYTE;
      int num=sscanf(ptr,"%i %i",&index,&code);
      /* if sscanf() returns 1 and the index is in range 0..255, then the
       * code is a DBCS lead byte; if sscanf() returns 2 and index>=256, this
       * is a double byte pair (lead byte + follower)
       */
      if (num>=1 && index<256) {
        bytetable[index]=(wchar_t)code;
      } else if (num==2 && index>=256 && index<LEADBYTE) {
        /* store the DBCS character in wordtable */
        if (wordtabletop>=wordtablesize) {
          /* grow the list */
          int newsize;
          struct wordpair *newblock;
          newsize= (wordtablesize==0) ? 128 : 2*wordtablesize;
          newblock=(struct wordpair *)malloc(newsize*sizeof(*wordtable));
          if (newblock!=NULL) {
            memcpy(newblock,wordtable,wordtabletop*sizeof(*wordtable));
            free(wordtable);
            wordtable=newblock;
            wordtablesize=newsize;
          } /* if */
        } /* if */
        if (wordtabletop<wordtablesize) {
          /* insert at sorted position */
          int pos=wordtabletop;
          assert(wordtable!=NULL);
          while (pos>0 && (unsigned)wordtable[pos-1].index>index) {
            wordtable[pos]=wordtable[pos-1];
            pos--;
          } /* while */
          wordtable[pos].index=(unsigned short)index;
          wordtable[pos].code=(wchar_t)code;
        } /* if */
      } /* if */
    } /* if */
  } /* while */

  fclose(fp);
  return TRUE;
}

cell cp_translate(const unsigned char *string,const unsigned char **endptr)
{
  wchar_t result;

  result=bytetable[*string++];
  /* check whether this is a leader code */
  if ((unsigned)result==LEADBYTE && wordtable!=NULL) {
    /* look up the code via binary search */
    int low,high,mid;
    unsigned short index=(unsigned short)(((*(string-1)) << 8) | *string);
    string++;
    assert(wordtabletop>0);
    low=0;
    high=wordtabletop-1;
    while (low<high) {
      mid=(low+high)/2;
      assert(low<=mid && mid<high);
      if (index>wordtable[mid].index)
        low=mid+1;
      else
        high=mid;
    } /* while */
    assert(low==high);
    if (wordtable[low].index==index)
      result=wordtable[low].code;
  } /* if */

  if (endptr!=NULL)
    *endptr=string;
  return (cell)result;
}

#endif  /* NO_CODEPAGE */

#if !defined NO_UTF8
cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
  int follow=0;
  long lowmark=0;
  unsigned char ch;
  cell result=0;

  if (endptr!=NULL)
    *endptr=string;

  for ( ;; ) {
    ch=*string++;

    if (follow>0 && (ch & 0xc0)==0x80) {
      /* leader code is active, combine with earlier code */
      result=(result << 6) | (ch & 0x3f);
      if (--follow==0) {
        /* encoding a character in more bytes than is strictly needed,
         * is not really valid UTF-8; we are strict here to increase
         * the chance of heuristic dectection of non-UTF-8 text
         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
         */
        if (result<lowmark)
          return -1;
        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
         * exist in UCS-4 (and hence, they do not exist in Unicode)
         */
        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
          return -1;
      } /* if */
      break;
    } else if (follow==0 && (ch & 0x80)==0x80) {
      /* UTF-8 leader code */
      if ((ch & 0xe0)==0xc0) {
        /* 110xxxxx 10xxxxxx */
        follow=1;
        lowmark=0x80L;
        result=ch & 0x1f;
      } else if ((ch & 0xf0)==0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
        follow=2;
        lowmark=0x800L;
        result=ch & 0x0f;
      } else if ((ch & 0xf8)==0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=3;
        lowmark=0x10000L;
        result=ch & 0x07;
      } else if ((ch & 0xfc)==0xf8) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=4;
        lowmark=0x200000L;
        result=ch & 0x03;
      } else if ((ch & 0xfe)==0xfc) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
        follow=5;
        lowmark=0x4000000L;
        result=ch & 0x01;
      } else {
        /* this is invalid UTF-8 */
        return -1;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x00) {
      /* 0xxxxxxx (US-ASCII) */
      result=ch;
      break;
    } else {
      /* this is invalid UTF-8 */
      return -1;
    } /* if */

  } /* for */

  if (endptr!=NULL)
    *endptr=string;
  return result;
}
#endif

int scan_utf8(void *fp,const char *filename)
{
  #if defined NO_UTF8
    return 0;
  #else
    static void *resetpos=NULL;
    int utf8=TRUE;
    int firstchar=TRUE,bom_found=FALSE;
    const unsigned char *ptr;

    resetpos=pc_getpossrc(fp,resetpos);
    while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {
      ptr=pline;
      if (firstchar) {
        /* check whether the very first character on the very first line
         * starts with a BYTE order mark
         */
        cell c=get_utf8_char(ptr,&ptr);
        bom_found= (c==0xfeff);
        utf8= (c>=0);
        firstchar=FALSE;
      } /* if */
      while (utf8 && *ptr!='\0')
        utf8= (get_utf8_char(ptr,&ptr)>=0);
    } /* while */
    pc_resetsrc(fp,resetpos);
    if (bom_found) {
      unsigned char bom[3];
      if (!utf8)
        error(77,filename);     /* malformed UTF-8 encoding */
      pc_readsrc(fp,bom,3);
      assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
    } /* if */
    return utf8;
  #endif  /* NO_UTF8 */
}