sourcemod/sourcepawn/compiler/sci18n.c
Scott Ehlert 251cced1f8 Spring Cleaning, Part Ichi (1)
Various minor things done to project files
Updated sample extension project file and updated makefile to the new unified version (more changes likely on the way)
Updated regex project file and makefile

--HG--
extra : convert_revision : svn%3A39bc706e-5318-0410-9160-8a85361fbb7c/trunk%401971
2008-03-30 07:00:22 +00:00

429 lines
14 KiB
C

/* Codepage translation to Unicode, and UTF-8 support
*
* The translation is based on codepage mapping files that are distributed
* by the Unicode consortium, see ftp://ftp.unicode.org/Public/MAPPINGS/.
*
* Character sets with a maximum of 256 codes are translated via a lookup
* table (these are Single-Byte Character Sets). Character sets like Shift-JIS
* with single-byte characters and multi-byte characters (introduced by a
* leader byte) are split into two tables: the 256-entry lookup table for
* the single-byte characters and an extended table for the multi-byte
* characters. The extended table is allocated dynamically; the lookup table
* is allocated statically, so loading SBCS tables cannot fail (if the tables
* themselves are valid, of course).
*
* Copyright (c) ITB CompuPhase, 2004-2006
*
* This software is provided "as-is", without any express or implied warranty.
* In no event will the authors be held liable for any damages arising from
* the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software in
* a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* Version: $Id$
*/
#include <assert.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "sc.h"
/* boolean constants, in case no earlier header has defined them */
#if !defined TRUE
#define FALSE 0
#define TRUE 1
#endif
/* size of the buffers in this file that hold a path name; the buffer in
 * cp_set() is also re-used to hold one line of a codepage file
 */
#if !defined _MAX_PATH
#define _MAX_PATH 250
#endif
/* the character that separates directory names in a path; selected on the
 * platform macros below unless it was already defined
 */
#if !defined DIRSEP_CHAR
#if defined LINUX || defined __FreeBSD__ || defined __OpenBSD__
#define DIRSEP_CHAR '/'
#elif defined macintosh
#define DIRSEP_CHAR ':'
#else
#define DIRSEP_CHAR '\\'
#endif
#endif
/* number of elements in an array; the argument must be a real array, not a
 * pointer
 */
#if !defined ELEMENTS
#define ELEMENTS(array) (sizeof(array) / sizeof(array[0]))
#endif
/* everything from here up to the matching "#endif" with the NO_CODEPAGE
 * comment is compiled only when NO_CODEPAGE is not defined
 */
#if !defined NO_CODEPAGE
/* maximum length of the name of a codepage file (without the path); cp_path()
 * keeps this many characters free at the end of the codepage directory
 */
#if !defined MAXCODEPAGE
#define MAXCODEPAGE 12 /* typically "cp" + 4 digits + ".txt" */
#endif
/* special values stored in the single-byte lookup table: INVALID marks a byte
 * for which the codepage file gave no mapping, LEADBYTE marks a byte that
 * introduces a double-byte character
 */
#define INVALID 0xffffu /* 0xffff and 0xfffe are invalid Unicode characters */
#define LEADBYTE 0xfffeu
/* one entry of the extended (double-byte) table: "index" is the lead byte in
 * the high 8 bits combined with the follower byte in the low 8 bits, "code"
 * is the value that the pair translates to
 */
struct wordpair {
unsigned short index;
wchar_t code;
};
/* directory (with a trailing separator) in which codepage files are looked up */
static char cprootpath[_MAX_PATH] = { DIRSEP_CHAR, '\0' };
/* translation table for single bytes, indexed by the byte value */
static wchar_t bytetable[256];
/* dynamically allocated table for double-byte characters, kept sorted on the
 * "index" field; NULL when no such table is loaded
 */
static struct wordpair *wordtable = NULL;
/* number of entries for which "wordtable" has room */
static unsigned wordtablesize = 0;
/* the insertion position for the next entry in "wordtable" */
static unsigned wordtabletop = 0;
/* cp_readline() reads the next non-empty line from "fp" into "string".
 *
 * A line ends at a '\r' or a '\n'; the terminator is NOT stored. Empty lines
 * are skipped (this also swallows the '\n' of a "\r\n" pair when the previous
 * call stopped on the '\r'). At most size-1 characters are stored and the
 * result is always zero-terminated; a line that is longer than that is
 * returned in pieces by successive calls, without losing any character.
 *
 * Returns 1 if at least one character was stored and 0 if nothing could be
 * read (end of file or read error).
 */
static int cp_readline(FILE *fp,char *string,size_t size)
{
  size_t count=0;
  int c;

  assert(size>1);
  /* test for room BEFORE fetching a character: a character that is read from
   * the stream while the buffer is already full cannot be stored and would
   * silently disappear
   */
  while (count<size-1 && (c=fgetc(fp))!=EOF) {
    if (c=='\r' || c=='\n') {
      if (count>0)      /* '\r' or '\n' ends a string */
        break;
      /* if count==0, the line started with a '\r' or '\n', or perhaps line
       * ends in the file are '\r\n' and we read and stopped on the '\r' of
       * the preceding line
       */
    } else {
      string[count++]=(char)c;
    } /* if */
  } /* while */
  string[count]='\0';
  return count>0;
}
/* cp_path() sets the directory where all codepage files must be found (if
 * the parameter to cp_set() specifies a full path, that is used instead).
 * The path is specified in two parts: root and directory; the full path
 * for the codepage directory is just the concatenation of the two, with a
 * directory separator in between. The directory is given in two parts,
 * because often a program already retrieves its "home" directory and the
 * codepages are most conveniently stored in a subdirectory of this home
 * directory.
 *
 * Either part may be NULL or empty. A directory separator is appended to the
 * root if it does not end in one (an absent root becomes a single separator),
 * and to a non-empty directory if it does not end in one.
 *
 * Returns FALSE, leaving the current path untouched, if the combined path
 * would not leave room for MAXCODEPAGE more characters in a buffer of
 * _MAX_PATH characters. Returns TRUE otherwise; in that case the current
 * codepage is also reset to the "linear" table through cp_set(NULL).
 */
SC_FUNC int cp_path(const char *root, const char *directory)
{
  size_t len1,len2,pos;
  int add_slash1,add_slash2;

  len1= (root!=NULL) ? strlen(root) : 0;
  add_slash1= (len1==0 || root[len1-1]!=DIRSEP_CHAR);
  len2= (directory!=NULL) ? strlen(directory) : 0;
  /* the check must look at the last character of "directory" itself; indexing
   * "root" with the length of "directory" tests an unrelated character, may
   * read outside the string and crashes when root is NULL
   */
  add_slash2= (len2>0 && directory[len2-1]!=DIRSEP_CHAR);
  if (len1+add_slash1+len2+add_slash2>=(_MAX_PATH-MAXCODEPAGE))
    return FALSE;       /* full filename may not fit */

  /* all lengths are known and verified, so the path is assembled piece by
   * piece at a running position
   */
  pos=0;
  if (len1>0) {
    memcpy(cprootpath,root,len1);
    pos=len1;
  } /* if */
  if (add_slash1)
    cprootpath[pos++]=DIRSEP_CHAR;
  if (len2>0) {
    memcpy(cprootpath+pos,directory,len2);
    pos+=len2;
  } /* if */
  if (add_slash2)
    cprootpath[pos++]=DIRSEP_CHAR;
  assert(pos<_MAX_PATH-MAXCODEPAGE);
  cprootpath[pos]='\0';

  cp_set(NULL);         /* start with a "linear" table (no translation) */
  return TRUE;
}
/* cp_set() loads a codepage from a file. The name parameter may be a
 * filename (including a full path) or it may be a partial codepage name.
 * If the name parameter is NULL or an empty string, the codepage is cleared
 * to be a "linear" table (no translation).
 * The following files are attempted to open (where <name> specifies the
 * value of the parameter):
 *    <name>                      (only if it contains a directory separator)
 *    <cprootpath>/<name>
 *    <cprootpath>/<name>.txt
 *    <cprootpath>/cp<name>
 *    <cprootpath>/cp<name>.txt
 * The search is abandoned as soon as the next candidate (without the root
 * path) would be longer than MAXCODEPAGE characters.
 *
 * Each line of the file holds one or two numbers in C notation (decimal,
 * 0x... or octal); text behind a '#' is a comment:
 *    <byte> <code>     a single-byte character (<byte> below 256)
 *    <byte>            a lead byte of a double-byte character
 *    <pair> <code>     a double-byte character; <pair> is 256 or higher
 * Bytes that the file does not mention map to INVALID.
 *
 * Returns TRUE on success and FALSE if no file could be opened; on failure
 * the current tables are left as they were. If memory for the double-byte
 * table runs out, the pairs that do not fit are dropped and the function
 * still returns TRUE.
 */
SC_FUNC int cp_set(const char *name)
{
  char filename[_MAX_PATH];     /* re-used as the line buffer once the file is open */
  FILE *fp=NULL;
  unsigned index;

  /* for name==NULL, set up an identity table */
  if (name==NULL || *name=='\0') {
    free(wordtable);            /* free(NULL) is harmless */
    wordtable=NULL;
    wordtablesize=0;
    wordtabletop=0;
    for (index=0; index<ELEMENTS(bytetable); index++)
      bytetable[index]=(wchar_t)index;
    return TRUE;
  } /* if */

  /* try to open the file as-is */
  if (strchr(name,DIRSEP_CHAR)!=NULL)
    fp=fopen(name,"rt");
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages */
    if (strlen(name)>MAXCODEPAGE)
      return FALSE;
    assert(strlen(name)+strlen(cprootpath)<_MAX_PATH);
    strcpy(filename,cprootpath);
    strcat(filename,name);
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with a ".txt"
     * extension; the limit is tested with '>' like all other candidates:
     * cp_path() keeps MAXCODEPAGE characters free, so a name of exactly
     * MAXCODEPAGE characters still fits
     */
    if (strlen(name)+4>MAXCODEPAGE)
      return FALSE;
    assert(strlen(filename)+4<_MAX_PATH);
    strcat(filename,".txt");
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with "cp" prefixed before the name */
    if (strlen(name)+2>MAXCODEPAGE)
      return FALSE;
    assert(2+strlen(name)+strlen(cprootpath)<_MAX_PATH);
    strcpy(filename,cprootpath);
    strcat(filename,"cp");
    strcat(filename,name);
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with "cp" prefixed and ".txt" appended */
    if (strlen(name)+2+4>MAXCODEPAGE)
      return FALSE;
    assert(strlen(filename)+4<_MAX_PATH);
    strcat(filename,".txt");
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL)
    return FALSE;       /* all failed */

  /* clear the tables */
  for (index=0; index<ELEMENTS(bytetable); index++)
    bytetable[index]=INVALID;   /* special code meaning "not found" */
  assert((wordtablesize==0 && wordtabletop==0 && wordtable==NULL)
         || (wordtablesize>0 && wordtable!=NULL));
  free(wordtable);
  wordtable=NULL;
  wordtablesize=0;
  wordtabletop=0;

  /* read in the table */
  while (cp_readline(fp,filename,sizeof filename)) {
    char *ptr;
    int idxval=0;
    int codeval=(int)LEADBYTE;  /* stays LEADBYTE if the line holds a single number */
    unsigned code;
    int num;

    if ((ptr=strchr(filename,'#'))!=NULL)
      *ptr='\0';                /* strip off comment */
    for (ptr=filename; *ptr>0 && *ptr<=' '; ptr++)
      /* nothing */;            /* skip leading whitespace */
    if (*ptr=='\0')
      continue;                 /* no content on this line */

    /* "%i" stores through a pointer to (signed) int, so the numbers are read
     * into int variables and converted afterwards
     */
    num=sscanf(ptr,"%i %i",&idxval,&codeval);
    if (num<1)
      continue;                 /* not a number, ignore the line */
    index=(unsigned)idxval;
    code=(unsigned)codeval;

    /* if sscanf() returns 1 and the index is in range 0..255, then the
     * code is a DBCS lead byte; if sscanf() returns 2 and index>=256, this
     * is a double byte pair (lead byte + follower)
     */
    if (index<256) {
      bytetable[index]=(wchar_t)code;
    } else if (num==2 && index<LEADBYTE) {
      /* store the DBCS character in wordtable */
      if (wordtabletop>=wordtablesize) {
        /* grow the list; realloc() acts as malloc() for the first block and
         * leaves the old block intact when it fails
         */
        unsigned newsize= (wordtablesize==0) ? 128 : 2*wordtablesize;
        struct wordpair *newblock=(struct wordpair *)realloc(wordtable,newsize*sizeof(*wordtable));
        if (newblock!=NULL) {
          wordtable=newblock;
          wordtablesize=newsize;
        } /* if */
      } /* if */
      if (wordtabletop<wordtablesize) {
        /* insert at sorted position */
        unsigned pos=wordtabletop;
        assert(wordtable!=NULL);
        while (pos>0 && (unsigned)wordtable[pos-1].index>index) {
          wordtable[pos]=wordtable[pos-1];
          pos--;
        } /* while */
        wordtable[pos].index=(unsigned short)index;
        wordtable[pos].code=(wchar_t)code;
        /* count the new entry; without this, every pair overwrites slot 0
         * and the table stays empty for the lookup in cp_translate()
         */
        wordtabletop++;
      } /* if */
    } /* if */
  } /* while */

  fclose(fp);
  return TRUE;
}
/* cp_translate() translates the character at the start of "string" through
 * the tables of the current codepage.
 *
 * The first byte is looked up in the single-byte table. If that byte is
 * marked as a lead byte and a double-byte table is loaded, the byte that
 * follows is consumed too and the pair is looked up in the double-byte table.
 *
 * Returns the translated code. For a byte without a mapping this is the value
 * stored in the single-byte table (INVALID after loading a codepage file);
 * for a lead byte whose pair is not found, it is LEADBYTE.
 * If "endptr" is not NULL, it is set to the first byte behind the character.
 */
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr)
{
  wchar_t result;
  unsigned char lead;

  lead=*string++;
  result=bytetable[lead];
  /* check whether this is a leader code; a lead byte that is the last byte
   * of the string has no follower: the terminating zero must not be consumed,
   * or "endptr" would point outside the string
   */
  if ((unsigned)result==LEADBYTE && wordtable!=NULL && *string!='\0') {
    unsigned short index=(unsigned short)((lead << 8) | *string);
    string++;
    /* an empty table cannot be searched (the upper bound would be -1) */
    if (wordtabletop>0) {
      /* look up the code via binary search */
      int low,high,mid;
      low=0;
      high=(int)wordtabletop-1;
      while (low<high) {
        mid=(low+high)/2;
        assert(low<=mid && mid<high);
        if (index>wordtable[mid].index)
          low=mid+1;
        else
          high=mid;
      } /* while */
      assert(low==high);
      if (wordtable[low].index==index)
        result=wordtable[low].code;
    } /* if */
  } /* if */

  if (endptr!=NULL)
    *endptr=string;
  return (cell)result;
}
#endif /* NO_CODEPAGE */
#if !defined NO_UTF8
/* get_utf8_char() decodes the UTF-8 character at the start of "string".
 *
 * Sequences of 1 to 6 bytes are accepted. The decoder is deliberately strict:
 * it rejects over-long encodings, the surrogate range 0xd800..0xdfff, the
 * codes 0xfffe and 0xffff, stray continuation bytes and truncated sequences
 * (the zero terminator is not a continuation byte, so decoding never runs
 * past the end of the string).
 *
 * Returns the code of the character, or -1 if the bytes are not valid UTF-8.
 * If "endptr" is not NULL, it is set to the first byte behind the character
 * on success, and to "string" itself on failure.
 */
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
  int follow=0;         /* number of continuation bytes still expected */
  long lowmark=0;       /* lowest code that needs a sequence of this length */
  unsigned char ch;
  cell result=0;

  if (endptr!=NULL)
    *endptr=string;     /* preset for the error returns below */

  for ( ;; ) {
    ch=*string++;
    if (follow>0 && (ch & 0xc0)==0x80) {
      /* leader code is active, combine with earlier code */
      result=(result << 6) | (ch & 0x3f);
      if (--follow==0) {
        /* encoding a character in more bytes than is strictly needed,
         * is not really valid UTF-8; we are strict here to increase
         * the chance of heuristic detection of non-UTF-8 text
         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
         */
        if (result<lowmark)
          return -1;
        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
         * exist in UCS-4 (and hence, they do not exist in Unicode)
         */
        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
          return -1;
        /* the character is complete only now; leaving the loop while
         * continuation bytes are still due would return a half-decoded
         * value for every sequence of three or more bytes
         */
        break;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x80) {
      /* UTF-8 leader code */
      if ((ch & 0xe0)==0xc0) {
        /* 110xxxxx 10xxxxxx */
        follow=1;
        lowmark=0x80L;
        result=ch & 0x1f;
      } else if ((ch & 0xf0)==0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
        follow=2;
        lowmark=0x800L;
        result=ch & 0x0f;
      } else if ((ch & 0xf8)==0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=3;
        lowmark=0x10000L;
        result=ch & 0x07;
      } else if ((ch & 0xfc)==0xf8) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=4;
        lowmark=0x200000L;
        result=ch & 0x03;
      } else if ((ch & 0xfe)==0xfc) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
        follow=5;
        lowmark=0x4000000L;
        result=ch & 0x01;
      } else {
        /* this is invalid UTF-8 */
        return -1;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x00) {
      /* 0xxxxxxx (US-ASCII) */
      result=ch;
      break;
    } else {
      /* this is invalid UTF-8 */
      return -1;
    } /* if */
  } /* for */

  if (endptr!=NULL)
    *endptr=string;
  return result;
}
#endif
/* scan_utf8() reads the source file "fp" line by line (into the global line
 * buffer "pline") and checks whether every character decodes as UTF-8. The
 * scan stops at the first character that does not. Afterwards the file is
 * set back to the position that it had on entry.
 *
 * If the first character of the first line is a BYTE order mark (0xfeff),
 * three bytes are read from the file again after the reset; when the file
 * has a BYTE order mark but is not valid UTF-8, error 77 is issued for
 * "filename".
 * NOTE(review): the 3-byte read presumably skips the BYTE order mark; this
 * depends on how pc_readsrc() treats its size parameter -- confirm.
 *
 * Returns non-zero if the file scanned as valid UTF-8 and zero otherwise;
 * when compiled with NO_UTF8 the function always returns zero.
 */
SC_FUNC int scan_utf8(FILE *fp,const char *filename)
{
#if defined NO_UTF8
  return 0;
#else
  void *startpos=pc_getpossrc(fp);
  int valid=TRUE;
  int at_start=TRUE;
  int has_bom=FALSE;
  const unsigned char *cursor;

  for ( ;; ) {
    /* no further line is fetched once an invalid character was seen */
    if (!valid || pc_readsrc(fp,pline,sLINEMAX)==NULL)
      break;
    cursor=pline;
    if (at_start) {
      /* only the very first character of the very first line can be a
       * BYTE order mark
       */
      cell first=get_utf8_char(cursor,&cursor);
      at_start=FALSE;
      has_bom= (first==0xfeff);
      valid= (first>=0);
    } /* if */
    /* decode the remainder of the line, character by character */
    while (valid && *cursor!='\0')
      valid= (get_utf8_char(cursor,&cursor)>=0);
  } /* for */

  pc_resetsrc(fp,startpos);
  if (has_bom) {
    unsigned char bom[3];
    if (!valid)
      error(77,filename);       /* malformed UTF-8 encoding */
    pc_readsrc(fp,bom,3);
    assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
  } /* if */
  return valid;
#endif  /* NO_UTF8 */
}