251cced1f8
Various minor things done to project files Updated sample extension project file and updated makefile to the new unified version (more changes likely on the way) Updated regex project file and makefile --HG-- extra : convert_revision : svn%3A39bc706e-5318-0410-9160-8a85361fbb7c/trunk%401971
451 lines
13 KiB
C
451 lines
13 KiB
C
/* compress.c -- Byte Pair Encoding compression */
|
|
/* Copyright 1996 Philip Gage */
|
|
|
|
/* This program appeared in the September 1997 issue of
|
|
* C/C++ Users Journal. The original source code may still
|
|
* be found at the web site of the magazine (www.cuj.com).
|
|
*
|
|
* It has been modified by me (Thiadmer Riemersma) to
|
|
* compress only a section of the input file and to store
|
|
* the compressed output along with the input as "C" strings.
|
|
*
|
|
* Compiling instructions:
|
|
* Borland C++ 16-bit (large memory model is required):
|
|
* bcc -ml scpack.c
|
|
*
|
|
* Watcom C/C++ 32-bit:
|
|
* wcl386 scpack.c
|
|
*
|
|
* GNU C (Linux), 32-bit:
|
|
* gcc scpack.c -o scpack
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <limits.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
/* Size of the buffer that receives the text to compress. When "unsigned"
 * is wider than 16 bits a fixed 1 MiB buffer is used; otherwise the size is
 * limited to what an "unsigned" can express. The expansion is parenthesized
 * so that the macro is safe inside larger expressions.
 */
#if UINT_MAX > 0xFFFFU
  #define MAXSIZE   (1024*1024L)
#else
  #define MAXSIZE   UINT_MAX        /* Input file buffer size */
#endif
#define HASHSIZE    8192            /* Hash table size, power of 2 */
#define THRESHOLD   3               /* Increase for speed, min 3 */

/* Directives that are recognized in the input file */
#define START_TOKEN "#ifdef SCPACK" /* start reading the buffer here */
#define NAME_TOKEN  "#define SCPACK_TABLE"
#define SEP_TOKEN   "#define SCPACK_SEPARATOR"
#define TERM_TOKEN  "#define SCPACK_TERMINATOR"
#define TEMPFILE    "~SCPACK.TMP"   /* output file when none is given */

/* Defaults; overwritten when the matching directive is found in the input */
static char tablename[32+1] = "scpack_table";
static char separator[16]=",";
static char terminator[16]="";
|
|
|
|
int compress(unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
|
|
{
|
|
unsigned char *left, *right, *count;
|
|
unsigned char a, b, bestcount;
|
|
unsigned i, j, index, bestindex, code=128;
|
|
|
|
/* Dynamically allocate buffers and check for errors */
|
|
left = (unsigned char *)malloc(HASHSIZE);
|
|
right = (unsigned char *)malloc(HASHSIZE);
|
|
count = (unsigned char *)malloc(HASHSIZE);
|
|
if (left==NULL || right==NULL || count==NULL) {
|
|
printf("Error allocating memory\n");
|
|
exit(1);
|
|
}
|
|
|
|
/* Check for errors */
|
|
for (i=0; i<buffersize; i++)
|
|
if (buffer[i] > 127) {
|
|
printf("This program works only on text files (7-bit ASCII)\n");
|
|
exit(1);
|
|
}
|
|
|
|
memset(pairtable, 0, 128*2*sizeof(char));
|
|
|
|
do { /* Replace frequent pairs with bytes 128..255 */
|
|
|
|
/* Enter counts of all byte pairs into hash table */
|
|
memset(count,0,HASHSIZE);
|
|
for (i=0; i<buffersize-1; i++) {
|
|
a = buffer[i];
|
|
b = buffer[i+1];
|
|
/* ignore any pair with a '\0' */
|
|
if (a == 0 || b == 0)
|
|
continue;
|
|
index = (a ^ (b << 6)) & (HASHSIZE-1);
|
|
while ((left[index] != a || right[index] != b) &&
|
|
count[index] != 0)
|
|
index = (index + 1) & (HASHSIZE-1);
|
|
left[index] = a;
|
|
right[index] = b;
|
|
if (count[index] < 255)
|
|
count[index] += (unsigned char)1;
|
|
}
|
|
|
|
/* Search hash table for most frequent pair */
|
|
bestcount = THRESHOLD - 1;
|
|
for (i=0; i<HASHSIZE; i++) {
|
|
if (count[i] > bestcount) {
|
|
bestcount = count[i];
|
|
bestindex = i;
|
|
}
|
|
}
|
|
|
|
/* Compress if enough occurrences of pair */
|
|
if (bestcount >= THRESHOLD) {
|
|
|
|
/* Add pair to table using code as index */
|
|
a = pairtable[code-128][0] = left[bestindex];
|
|
b = pairtable[code-128][1] = right[bestindex];
|
|
|
|
/* Replace all pair occurrences with unused byte */
|
|
for (i=0, j=0; i<buffersize; i++, j++)
|
|
if (a == buffer[i] && b == buffer[i+1]) {
|
|
buffer[j] = (unsigned char)code;
|
|
++i;
|
|
}
|
|
else
|
|
buffer[j] = buffer[i];
|
|
buffersize = j;
|
|
}
|
|
else
|
|
break;
|
|
} while (++code < 255);
|
|
|
|
/* done */
|
|
free(left); free(right); free(count);
|
|
return buffersize; /* return adjusted buffersize */
|
|
}
|
|
|
|
/* strmatch
 * Tests whether "str", after any leading spaces and tabs, begins with
 * "token". On a match the number of skipped blanks is stored in "*indent"
 * (unless "indent" is NULL) and 1 is returned. Without a match the function
 * returns 0 and "*indent" is left alone.
 */
static int strmatch(char *str, char *token, int *indent)
{
  char *cursor = str;
  size_t tokenlen = strlen(token);

  /* skip whitespace */
  for ( ; *cursor==' ' || *cursor=='\t'; cursor++)
    /* nothing */;

  if (strncmp(cursor,token,tokenlen)!=0)
    return 0;

  if (indent != NULL)
    *indent = (int)(cursor - str);
  return 1;
}
|
|
|
|
/* check_if
 * Aborts the program (exit code 1) with an error message that mentions
 * "linenr" when the line in "str" starts, after optional blanks, with
 * "#if". Any other line is accepted silently.
 */
static void check_if(char *str,int linenr)
{
  int conditional = strmatch(str,"#if",NULL);

  if (!conditional)
    return;

  printf("Error: \"#if...\" preprocessor statement should not be in SCPACK section "
         "(line %d)\n", linenr);
  exit(1);
}
|
|
|
|
static int check_tablename(char *str)
|
|
{
|
|
int i;
|
|
|
|
if (strmatch(str,NAME_TOKEN,NULL)) {
|
|
str += strlen(NAME_TOKEN);
|
|
while (*str==' ' || *str=='\t')
|
|
str++;
|
|
for (i=0; i<(sizeof tablename - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
|
|
tablename[i] = *str;
|
|
tablename[i] = '\0';
|
|
return 1;
|
|
} /* if */
|
|
return 0;
|
|
}
|
|
|
|
static int check_separator(char *str)
|
|
{
|
|
int i;
|
|
|
|
if (strmatch(str,SEP_TOKEN,NULL)) {
|
|
str += strlen(SEP_TOKEN);
|
|
while (*str==' ' || *str=='\t')
|
|
str++;
|
|
for (i=0; i<(sizeof separator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
|
|
separator[i] = *str;
|
|
separator[i] = '\0';
|
|
return 1;
|
|
} /* if */
|
|
|
|
if (strmatch(str,TERM_TOKEN,NULL)) {
|
|
str += strlen(TERM_TOKEN);
|
|
while (*str==' ' || *str=='\t')
|
|
str++;
|
|
for (i=0; i<(sizeof terminator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
|
|
terminator[i] = *str;
|
|
terminator[i] = '\0';
|
|
return 1;
|
|
} /* if */
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* readbuffer
|
|
* Reads in the input file and stores all strings in the
|
|
* section between "#ifdef SCPACK" and "#else" in a buffer.
|
|
* Only text that is between double quotes is added to the
|
|
* buffer; the \" escape code is handled. Multiple strings
|
|
* on one line are handled.
|
|
*/
|
|
unsigned readbuffer(FILE *input, unsigned char *buffer)
|
|
{
|
|
char str[256];
|
|
unsigned buffersize;
|
|
int i,linenr;
|
|
|
|
linenr=0;
|
|
buffersize=0;
|
|
|
|
rewind(input);
|
|
while (!feof(input)) {
|
|
while (fgets(str,sizeof str,input)!=NULL) {
|
|
linenr++;
|
|
check_tablename(str);
|
|
check_separator(str);
|
|
if (strmatch(str,START_TOKEN,NULL))
|
|
break;
|
|
} /* while */
|
|
if (!strmatch(str,START_TOKEN,NULL))
|
|
return buffersize; /* no (more) section found, quit */
|
|
|
|
while (fgets(str,sizeof str,input)!=NULL) {
|
|
linenr++;
|
|
check_if(str,linenr);
|
|
if (check_tablename(str))
|
|
printf("Error: table name definition should not be in SCPACK section (line %d)\n", linenr);
|
|
check_separator(str);
|
|
if (strmatch(str,"#else",NULL))
|
|
break; /* done */
|
|
/* add to the buffer only what is between double quotes */
|
|
i=0;
|
|
do {
|
|
while (str[i]!='\0' && str[i]!='"')
|
|
i++;
|
|
if (str[i]=='"') {
|
|
/* we are in a string */
|
|
i++;
|
|
while (str[i]!='\0' && str[i]!='"') {
|
|
/* handle escape sequences */
|
|
if (str[i]=='\\') {
|
|
i++;
|
|
switch (str[i]) {
|
|
case 'a': /* alarm */
|
|
buffer[buffersize++]='\a';
|
|
i++;
|
|
break;
|
|
case 'b': /* backspace */
|
|
buffer[buffersize++]='\b';
|
|
i++;
|
|
break;
|
|
case 'f': /* form feed */
|
|
buffer[buffersize++]='\f';
|
|
i++;
|
|
break;
|
|
case 'n': /* newline */
|
|
buffer[buffersize++]='\n';
|
|
i++;
|
|
break;
|
|
case 'r': /* carriage return */
|
|
buffer[buffersize++]='\n';
|
|
i++;
|
|
break;
|
|
case 't': /* tab */
|
|
buffer[buffersize++]='\t';
|
|
i++;
|
|
break;
|
|
case '\'':
|
|
buffer[buffersize++]='\'';
|
|
i++;
|
|
break;
|
|
case '"':
|
|
buffer[buffersize++]='"';
|
|
i++;
|
|
break;
|
|
default:
|
|
// ??? octal character code escapes and hexadecimal escapes
|
|
// not supported
|
|
printf("Unknown escape sequence '\\%c' on line %d\n",
|
|
str[i], linenr);
|
|
} /* switch */
|
|
} else {
|
|
buffer[buffersize++]=str[i++];
|
|
} /* if */
|
|
} /* while */
|
|
if (str[i]=='"') {
|
|
buffer[buffersize++]='\0'; /* terminate each string */
|
|
i++;
|
|
} else {
|
|
printf("Error: unterminated string on line %d\n",linenr);
|
|
} /* if */
|
|
} /* if */
|
|
} while (str[i]!='\0');
|
|
} /* while - in SCPACK section */
|
|
/* put in another '\0' to terminate the section */
|
|
buffer[buffersize++]='\0';
|
|
} /* while - !feof(input) */
|
|
return buffersize;
|
|
}
|
|
|
|
/* write_pairtable
 * Writes the pair table to "output" as the C definition of an array of
 * byte pairs with the name "tablename", 16 pairs per line. The definition
 * is enclosed in two marker comments that start with "/*-*SCPACK". Output
 * stops at the first entry of which either byte is zero, or after 128
 * entries.
 */
static void write_pairtable(FILE *output, unsigned char pairtable[128][2], char *tablename)
{
  int entry = 0;

  /* dump the pair table */
  fputs("/*-*SCPACK start of pair table, do not change or remove this line */\n", output);
  fprintf(output, "unsigned char %s[][2] = {", tablename);

  while (entry<128 && pairtable[entry][0]!=0 && pairtable[entry][1]!=0) {
    /* a comma goes between two pairs, never behind the last one */
    if (entry > 0)
      fputc(',', output);
    fputs(((entry % 16)==0) ? "\n " : " ", output);
    fprintf(output, "{%d,%d}", pairtable[entry][0], pairtable[entry][1]);
    entry++;
  } /* while */

  fputs("\n};\n", output);
  fputs("/*-*SCPACK end of pair table, do not change or remove this line */\n", output);
}
|
|
|
|
void writefile(FILE *input, FILE *output, unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
|
|
{
|
|
char str[256];
|
|
int insection, indent, needseparator;
|
|
unsigned char *bufptr;
|
|
|
|
bufptr = buffer;
|
|
insection = 0;
|
|
|
|
rewind(input);
|
|
while (!feof(input)) {
|
|
while (fgets(str,sizeof str,input)!=NULL) {
|
|
fprintf(output,"%s",str);
|
|
if (check_tablename(str)) {
|
|
write_pairtable(output, pairtable, tablename);
|
|
/* strip an existing pair table from the file */
|
|
if (fgets(str,sizeof str,input)!=NULL) {
|
|
if (strmatch(str,"/*-*SCPACK",NULL)) {
|
|
while (fgets(str,sizeof str,input)!=NULL)
|
|
if (strmatch(str,"/*-*SCPACK",NULL))
|
|
break;
|
|
} else {
|
|
fprintf(output,"%s",str);
|
|
} /* if */
|
|
} /* if */
|
|
} /* if */
|
|
if (strmatch(str,START_TOKEN,NULL))
|
|
insection = 1;
|
|
if (insection && strmatch(str,"#else",NULL))
|
|
break;
|
|
} /* while */
|
|
if (!strmatch(str,"#else",&indent))
|
|
return; /* no (more) section found, quit */
|
|
insection=0;
|
|
|
|
/* dump the buffer as strings, separated with commas */
|
|
needseparator = 0;
|
|
while (*bufptr != '\0') {
|
|
assert((unsigned)(bufptr-buffer) < buffersize);
|
|
if (needseparator)
|
|
fprintf(output, "%s\n",separator);
|
|
fprintf(output, "%*c\"",indent+2,' ');
|
|
/* loop over string */
|
|
while (*bufptr != '\0') {
|
|
if (*bufptr<' ' || *bufptr >= 128 || *bufptr == '"' || *bufptr == '\\')
|
|
fprintf(output, "\\%03o", *bufptr);
|
|
else
|
|
fprintf(output, "%c", *bufptr);
|
|
bufptr++;
|
|
} /* while */
|
|
fprintf(output, "\"");
|
|
needseparator = 1;
|
|
bufptr++; /* skip '\0' */
|
|
} /* while */
|
|
fprintf(output, "%s\n",terminator);
|
|
bufptr++;
|
|
|
|
/* skip the input file until the #endif section */
|
|
while (fgets(str,sizeof str,input)!=NULL) {
|
|
if (strmatch(str,"#endif",NULL)) {
|
|
fprintf(output,"%s",str);
|
|
break; /* done */
|
|
} /* if */
|
|
} /* while */
|
|
} /* while - !feof(input) */
|
|
}
|
|
|
|
/* usage
 * Shows the command line syntax on stdout and ends the program with exit
 * code 1; this function does not return.
 */
static void usage(void)
{
  fputs("Usage: scpack <filename> [output file]\n", stdout);
  exit(1);
}
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
FILE *in, *out;
|
|
unsigned char *buffer;
|
|
unsigned buffersize, orgbuffersize;
|
|
unsigned char pairtable[128][2];
|
|
|
|
if (argc < 2 || argc > 3)
|
|
usage();
|
|
if ((in=fopen(argv[1],"rt"))==NULL) {
|
|
printf("SCPACK: error opening input %s\n",argv[1]);
|
|
usage();
|
|
} /* if */
|
|
if (argc == 2) {
|
|
if ((out=fopen(TEMPFILE,"wt"))==NULL) {
|
|
printf("SCPACK: error opening temporary file %s\n",TEMPFILE);
|
|
usage();
|
|
} /* if */
|
|
} else {
|
|
if ((out=fopen(argv[2],"wt"))==NULL) {
|
|
printf("SCPACK: error opening output file %s\n",argv[2]);
|
|
usage();
|
|
} /* if */
|
|
} /* if */
|
|
|
|
buffer = (unsigned char *)malloc(MAXSIZE);
|
|
if (buffer == NULL) {
|
|
printf("SCPACK: error allocating memory\n");
|
|
return 1;
|
|
} /* if */
|
|
/* 1. read the buffer
|
|
* 2. compress the buffer
|
|
* 3. copy the file, insert the compressed buffer
|
|
*/
|
|
buffersize = readbuffer(in, buffer);
|
|
orgbuffersize = buffersize;
|
|
if (buffersize > 0) {
|
|
buffersize = compress(buffer, buffersize, pairtable);
|
|
writefile(in, out, buffer, buffersize, pairtable);
|
|
printf("SCPACK: compression ratio: %ld%% (%d -> %d)\n",
|
|
100L-(100L*buffersize)/orgbuffersize, orgbuffersize, buffersize);
|
|
} else {
|
|
printf("SCPACK: no SCPACK section found, nothing to do\n");
|
|
} /* if */
|
|
fclose(out);
|
|
fclose(in);
|
|
/* let the new file replace the old file */
|
|
if (buffersize == 0) {
|
|
if (argc == 2)
|
|
remove(TEMPFILE);
|
|
else
|
|
remove(argv[2]);
|
|
} else if (argc == 2) {
|
|
remove(argv[1]);
|
|
rename(TEMPFILE,argv[1]);
|
|
} /* if */
|
|
return 0;
|
|
}
|