451 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			451 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* compress.c -- Byte Pair Encoding compression */
 | |
| /* Copyright 1996 Philip Gage */
 | |
| 
 | |
| /* This program appeared in the September 1997 issue of
 | |
|  * C/C++ Users Journal. The original source code may still
 | |
|  * be found at the web site of the magazine (www.cuj.com).
 | |
|  *
 | |
|  * It has been modified by me (Thiadmer Riemersma) to
 | |
|  * compress only a section of the input file and to store
 | |
|  * the compressed output along with the input as "C" strings.
 | |
|  *
 | |
|  * Compiling instructions:
 | |
|  *  Borland C++ 16-bit (large memory model is required):
 | |
|  *      bcc -ml scpack.c
 | |
|  *
 | |
|  *  Watcom C/C++ 32-bit:
 | |
|  *      wcl386 scpack.c
 | |
|  *
 | |
|  *  GNU C (Linux), 32-bit:
 | |
|  *      gcc scpack.c -o scpack
 | |
|  */
 | |
| 
 | |
| #include <assert.h>
 | |
| #include <limits.h>
 | |
| #include <stdio.h>
 | |
| #include <stdlib.h>
 | |
| #include <string.h>
 | |
| 
 | |
/* Size in bytes of the buffer that collects the strings of all SCPACK
 * sections. The expansion is parenthesized so that the macro can be used
 * safely inside a larger expression.
 */
#if UINT_MAX > 0xFFFFU
  #define MAXSIZE (1024*1024L)
#else
  #define MAXSIZE UINT_MAX      /* Input file buffer size */
#endif
#define HASHSIZE 8192   /* Hash table size, power of 2 */
#define THRESHOLD   3   /* Increase for speed, min 3 */

/* Directives that this program recognizes in the input file */
#define START_TOKEN "#ifdef SCPACK" /* start reading the buffer here */
#define NAME_TOKEN  "#define SCPACK_TABLE"      /* followed by the name for the pair table */
#define SEP_TOKEN   "#define SCPACK_SEPARATOR"  /* followed by the text written between strings */
#define TERM_TOKEN  "#define SCPACK_TERMINATOR" /* followed by the text written after the last string */
#define TEMPFILE    "~SCPACK.TMP"   /* output file when no output name is given */

/* Current settings; the initializers are the defaults that apply when the
 * input file does not contain the matching directive.
 */
static char tablename[32+1] = "scpack_table";
static char separator[16]=",";
static char terminator[16]="";
 | |
| 
 | |
| int compress(unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
 | |
| {
 | |
|   unsigned char *left, *right, *count;
 | |
|   unsigned char a, b, bestcount;
 | |
|   unsigned i, j, index, bestindex, code=128;
 | |
| 
 | |
|   /* Dynamically allocate buffers and check for errors */
 | |
|   left = (unsigned char *)malloc(HASHSIZE);
 | |
|   right = (unsigned char *)malloc(HASHSIZE);
 | |
|   count = (unsigned char *)malloc(HASHSIZE);
 | |
|   if (left==NULL || right==NULL || count==NULL) {
 | |
|     printf("Error allocating memory\n");
 | |
|     exit(1);
 | |
|   }
 | |
| 
 | |
|   /* Check for errors */
 | |
|   for (i=0; i<buffersize; i++)
 | |
|     if (buffer[i] > 127) {
 | |
|       printf("This program works only on text files (7-bit ASCII)\n");
 | |
|       exit(1);
 | |
|     }
 | |
| 
 | |
|   memset(pairtable, 0, 128*2*sizeof(char));
 | |
| 
 | |
|   do {  /* Replace frequent pairs with bytes 128..255 */
 | |
| 
 | |
|     /* Enter counts of all byte pairs into hash table */
 | |
|     memset(count,0,HASHSIZE);
 | |
|     for (i=0; i<buffersize-1; i++) {
 | |
|       a = buffer[i];
 | |
|       b = buffer[i+1];
 | |
|       /* ignore any pair with a '\0' */
 | |
|       if (a == 0 || b == 0)
 | |
|         continue;
 | |
|       index = (a ^ (b << 6)) & (HASHSIZE-1);
 | |
|       while ((left[index] != a || right[index] != b) &&
 | |
|              count[index] != 0)
 | |
|         index = (index + 1) & (HASHSIZE-1);
 | |
|       left[index] = a;
 | |
|       right[index] = b;
 | |
|       if (count[index] < 255)
 | |
|         count[index] += (unsigned char)1;
 | |
|     }
 | |
| 
 | |
|     /* Search hash table for most frequent pair */
 | |
|     bestcount = THRESHOLD - 1;
 | |
|     for (i=0; i<HASHSIZE; i++) {
 | |
|       if (count[i] > bestcount) {
 | |
|         bestcount = count[i];
 | |
|         bestindex = i;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     /* Compress if enough occurrences of pair */
 | |
|     if (bestcount >= THRESHOLD) {
 | |
| 
 | |
|       /* Add pair to table using code as index */
 | |
|       a = pairtable[code-128][0] = left[bestindex];
 | |
|       b = pairtable[code-128][1] = right[bestindex];
 | |
| 
 | |
|       /* Replace all pair occurrences with unused byte */
 | |
|       for (i=0, j=0; i<buffersize; i++, j++)
 | |
|         if (a == buffer[i] && b == buffer[i+1]) {
 | |
|           buffer[j] = (unsigned char)code;
 | |
|           ++i;
 | |
|         }
 | |
|         else
 | |
|           buffer[j] = buffer[i];
 | |
|       buffersize = j;
 | |
|     }
 | |
|     else
 | |
|       break;
 | |
|   } while (++code < 255);
 | |
| 
 | |
|   /* done */
 | |
|   free(left); free(right); free(count);
 | |
|   return buffersize;  /* return adjusted buffersize */
 | |
| }
 | |
| 
 | |
/* strmatch
 * Tests whether "str", after optional leading spaces and tabs, starts with
 * "token". Only the first strlen(token) characters are compared, so the
 * text may continue after the token.
 *
 * str     the line to examine
 * token   the text to look for
 * indent  if not NULL, receives the number of skipped whitespace characters
 *         when the token is found; it is left untouched otherwise
 *
 * Returns 1 on a match and 0 if the token is not found.
 */
static int strmatch(const char *str, const char *token, int *indent)
{
  int skipped = 0;

  /* skip whitespace */
  while (str[skipped]==' ' || str[skipped]=='\t')
    skipped++;

  if (strncmp(str+skipped,token,strlen(token))!=0)
    return 0;
  if (indent != NULL)
    *indent = skipped;
  return 1;
}
 | |
| 
 | |
/* check_if
 * Terminates the program with an error message when the line starts with
 * "#if" (after optional whitespace). The test is a prefix match, so it
 * also triggers on longer directives that begin with these characters.
 * "linenr" is the line number that is reported in the message.
 */
static void check_if(char *str,int linenr)
{
  if (!strmatch(str,"#if",NULL))
    return;             /* not a conditional, nothing to report */

  printf("Error: \"#if...\" preprocessor statement should not be in SCPACK section "
         "(line %d)\n", linenr);
  exit(1);
}
 | |
| 
 | |
| static int check_tablename(char *str)
 | |
| {
 | |
|   int i;
 | |
| 
 | |
|   if (strmatch(str,NAME_TOKEN,NULL)) {
 | |
|     str += strlen(NAME_TOKEN);
 | |
|     while (*str==' ' || *str=='\t')
 | |
|       str++;
 | |
|     for (i=0; i<(sizeof tablename - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
 | |
|       tablename[i] = *str;
 | |
|     tablename[i] = '\0';
 | |
|     return 1;
 | |
|   } /* if */
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| static int check_separator(char *str)
 | |
| {
 | |
|   int i;
 | |
| 
 | |
|   if (strmatch(str,SEP_TOKEN,NULL)) {
 | |
|     str += strlen(SEP_TOKEN);
 | |
|     while (*str==' ' || *str=='\t')
 | |
|       str++;
 | |
|     for (i=0; i<(sizeof separator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
 | |
|       separator[i] = *str;
 | |
|     separator[i] = '\0';
 | |
|     return 1;
 | |
|   } /* if */
 | |
| 
 | |
|   if (strmatch(str,TERM_TOKEN,NULL)) {
 | |
|     str += strlen(TERM_TOKEN);
 | |
|     while (*str==' ' || *str=='\t')
 | |
|       str++;
 | |
|     for (i=0; i<(sizeof terminator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
 | |
|       terminator[i] = *str;
 | |
|     terminator[i] = '\0';
 | |
|     return 1;
 | |
|   } /* if */
 | |
| 
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| /* readbuffer
 | |
|  * Reads in the input file and stores all strings in the
 | |
|  * section between "#ifdef SCPACK" and "#else" in a buffer.
 | |
|  * Only text that is between double quotes is added to the
 | |
|  * buffer; the \" escape code is handled. Multiple strings
 | |
|  * on one line are handled.
 | |
|  */
 | |
| unsigned readbuffer(FILE *input, unsigned char *buffer)
 | |
| {
 | |
|   char str[256];
 | |
|   unsigned buffersize;
 | |
|   int i,linenr;
 | |
| 
 | |
|   linenr=0;
 | |
|   buffersize=0;
 | |
| 
 | |
|   rewind(input);
 | |
|   while (!feof(input)) {
 | |
|     while (fgets(str,sizeof str,input)!=NULL) {
 | |
|       linenr++;
 | |
|       check_tablename(str);
 | |
|       check_separator(str);
 | |
|       if (strmatch(str,START_TOKEN,NULL))
 | |
|         break;
 | |
|     } /* while */
 | |
|     if (!strmatch(str,START_TOKEN,NULL))
 | |
|       return buffersize;  /* no (more) section found, quit */
 | |
| 
 | |
|     while (fgets(str,sizeof str,input)!=NULL) {
 | |
|       linenr++;
 | |
|       check_if(str,linenr);
 | |
|       if (check_tablename(str))
 | |
|         printf("Error: table name definition should not be in SCPACK section (line %d)\n", linenr);
 | |
|       check_separator(str);
 | |
|       if (strmatch(str,"#else",NULL))
 | |
|         break;          /* done */
 | |
|       /* add to the buffer only what is between double quotes */
 | |
|       i=0;
 | |
|       do {
 | |
|         while (str[i]!='\0' && str[i]!='"')
 | |
|           i++;
 | |
|         if (str[i]=='"') {
 | |
|           /* we are in a string */
 | |
|           i++;
 | |
|           while (str[i]!='\0' && str[i]!='"') {
 | |
|             /* handle escape sequences */
 | |
|             if (str[i]=='\\') {
 | |
|               i++;
 | |
|               switch (str[i]) {
 | |
|               case 'a': /* alarm */
 | |
|                 buffer[buffersize++]='\a';
 | |
|                 i++;
 | |
|                 break;
 | |
|               case 'b': /* backspace */
 | |
|                 buffer[buffersize++]='\b';
 | |
|                 i++;
 | |
|                 break;
 | |
|               case 'f': /* form feed */
 | |
|                 buffer[buffersize++]='\f';
 | |
|                 i++;
 | |
|                 break;
 | |
|               case 'n': /* newline */
 | |
|                 buffer[buffersize++]='\n';
 | |
|                 i++;
 | |
|                 break;
 | |
|               case 'r': /* carriage return */
 | |
|                 buffer[buffersize++]='\n';
 | |
|                 i++;
 | |
|                 break;
 | |
|               case 't': /* tab */
 | |
|                 buffer[buffersize++]='\t';
 | |
|                 i++;
 | |
|                 break;
 | |
|               case '\'':
 | |
|                 buffer[buffersize++]='\'';
 | |
|                 i++;
 | |
|                 break;
 | |
|               case '"':
 | |
|                 buffer[buffersize++]='"';
 | |
|                 i++;
 | |
|                 break;
 | |
|               default:
 | |
|                 // ??? octal character code escapes and hexadecimal escapes
 | |
|                 //     not supported
 | |
|                 printf("Unknown escape sequence '\\%c' on line %d\n",
 | |
|                        str[i], linenr);
 | |
|               } /* switch */
 | |
|             } else {
 | |
|               buffer[buffersize++]=str[i++];
 | |
|             } /* if */
 | |
|           } /* while */
 | |
|           if (str[i]=='"') {
 | |
|             buffer[buffersize++]='\0'; /* terminate each string */
 | |
|             i++;
 | |
|           } else {
 | |
|             printf("Error: unterminated string on line %d\n",linenr);
 | |
|           } /* if */
 | |
|         } /* if */
 | |
|       } while (str[i]!='\0');
 | |
|     } /* while - in SCPACK section */
 | |
|     /* put in another '\0' to terminate the section */
 | |
|     buffer[buffersize++]='\0';
 | |
|   } /* while - !feof(input) */
 | |
|   return buffersize;
 | |
| }
 | |
| 
 | |
/* write_pairtable
 * Writes the pair table to "output" as the definition of a C array called
 * "tablename", enclosed between two marker comment lines. The pairs are
 * written 16 to a line. The table ends at the first entry that contains a
 * zero byte (or after 128 entries).
 *
 * An empty table is written as the single entry {0,0}, because an array
 * definition with an empty initializer list is not valid C.
 */
static void write_pairtable(FILE *output, unsigned char pairtable[128][2], char *tablename)
{
  int i;

  /* dump the pair table */
  fprintf(output, "/*-*SCPACK start of pair table, do not change or remove this line */\n");
  fprintf(output, "unsigned char %s[][2] = {", tablename);
  for (i=0; i<128 && pairtable[i][0]!=0 && pairtable[i][1]!=0; i++) {
    /* the comma goes in front of every pair except the first one, so that
     * no look-ahead at the next entry is needed
     */
    if (i>0)
      fputs(",", output);
    fputs((i % 16)==0 ? "\n  " : " ", output);
    fprintf(output, "{%d,%d}", pairtable[i][0], pairtable[i][1]);
  } /* for */
  if (i==0)
    fputs("\n  {0,0}", output);   /* placeholder, no pair was stored */
  fprintf(output, "\n};\n");
  fprintf(output, "/*-*SCPACK end of pair table, do not change or remove this line */\n");
}
 | |
| 
 | |
| void writefile(FILE *input, FILE *output, unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
 | |
| {
 | |
|   char str[256];
 | |
|   int insection, indent, needseparator;
 | |
|   unsigned char *bufptr;
 | |
| 
 | |
|   bufptr = buffer;
 | |
|   insection = 0;
 | |
| 
 | |
|   rewind(input);
 | |
|   while (!feof(input)) {
 | |
|     while (fgets(str,sizeof str,input)!=NULL) {
 | |
|       fprintf(output,"%s",str);
 | |
|       if (check_tablename(str)) {
 | |
|         write_pairtable(output, pairtable, tablename);
 | |
|         /* strip an existing pair table from the file */
 | |
|         if (fgets(str,sizeof str,input)!=NULL) {
 | |
|           if (strmatch(str,"/*-*SCPACK",NULL)) {
 | |
|             while (fgets(str,sizeof str,input)!=NULL)
 | |
|               if (strmatch(str,"/*-*SCPACK",NULL))
 | |
|                 break;
 | |
|           } else {
 | |
|             fprintf(output,"%s",str);
 | |
|           } /* if */
 | |
|         } /* if */
 | |
|       } /* if */
 | |
|       if (strmatch(str,START_TOKEN,NULL))
 | |
|         insection = 1;
 | |
|       if (insection && strmatch(str,"#else",NULL))
 | |
|         break;
 | |
|     } /* while */
 | |
|     if (!strmatch(str,"#else",&indent))
 | |
|       return;           /* no (more) section found, quit */
 | |
|     insection=0;
 | |
| 
 | |
|     /* dump the buffer as strings, separated with commas */
 | |
|     needseparator = 0;
 | |
|     while (*bufptr != '\0') {
 | |
|       assert((unsigned)(bufptr-buffer) < buffersize);
 | |
|       if (needseparator)
 | |
|         fprintf(output, "%s\n",separator);
 | |
|       fprintf(output, "%*c\"",indent+2,' ');
 | |
|       /* loop over string */
 | |
|       while (*bufptr != '\0') {
 | |
|         if (*bufptr<' ' || *bufptr >= 128 || *bufptr == '"' || *bufptr == '\\')
 | |
|           fprintf(output, "\\%03o", *bufptr);
 | |
|         else
 | |
|           fprintf(output, "%c", *bufptr);
 | |
|         bufptr++;
 | |
|       } /* while */
 | |
|       fprintf(output, "\"");
 | |
|       needseparator = 1;
 | |
|       bufptr++;           /* skip '\0' */
 | |
|     } /* while */
 | |
|     fprintf(output, "%s\n",terminator);
 | |
|     bufptr++;
 | |
| 
 | |
|     /* skip the input file until the #endif section */
 | |
|     while (fgets(str,sizeof str,input)!=NULL) {
 | |
|       if (strmatch(str,"#endif",NULL)) {
 | |
|         fprintf(output,"%s",str);
 | |
|         break;          /* done */
 | |
|       } /* if */
 | |
|     } /* while */
 | |
|   } /* while - !feof(input) */
 | |
| }
 | |
| 
 | |
/* usage
 * Prints the command line syntax on standard output and terminates the
 * program with exit code 1.
 */
static void usage(void)
{
  fputs("Usage: scpack <filename> [output file]\n", stdout);
  exit(1);
}
 | |
| 
 | |
| int main(int argc, char **argv)
 | |
| {
 | |
|   FILE *in, *out;
 | |
|   unsigned char *buffer;
 | |
|   unsigned buffersize, orgbuffersize;
 | |
|   unsigned char pairtable[128][2];
 | |
| 
 | |
|   if (argc < 2 || argc > 3)
 | |
|     usage();
 | |
|   if ((in=fopen(argv[1],"rt"))==NULL) {
 | |
|     printf("SCPACK: error opening input %s\n",argv[1]);
 | |
|     usage();
 | |
|   } /* if */
 | |
|   if (argc == 2) {
 | |
|     if ((out=fopen(TEMPFILE,"wt"))==NULL) {
 | |
|       printf("SCPACK: error opening temporary file %s\n",TEMPFILE);
 | |
|       usage();
 | |
|     } /* if */
 | |
|   } else {
 | |
|     if ((out=fopen(argv[2],"wt"))==NULL) {
 | |
|       printf("SCPACK: error opening output file %s\n",argv[2]);
 | |
|       usage();
 | |
|     } /* if */
 | |
|   } /* if */
 | |
| 
 | |
|   buffer = (unsigned char *)malloc(MAXSIZE);
 | |
|   if (buffer == NULL) {
 | |
|     printf("SCPACK: error allocating memory\n");
 | |
|     return 1;
 | |
|   } /* if */
 | |
|   /* 1. read the buffer
 | |
|    * 2. compress the buffer
 | |
|    * 3. copy the file, insert the compressed buffer
 | |
|    */
 | |
|   buffersize = readbuffer(in, buffer);
 | |
|   orgbuffersize = buffersize;
 | |
|   if (buffersize > 0) {
 | |
|     buffersize = compress(buffer, buffersize, pairtable);
 | |
|     writefile(in, out, buffer, buffersize, pairtable);
 | |
|     printf("SCPACK: compression ratio: %ld%% (%d -> %d)\n",
 | |
|            100L-(100L*buffersize)/orgbuffersize, orgbuffersize, buffersize);
 | |
|   } else {
 | |
|     printf("SCPACK: no SCPACK section found, nothing to do\n");
 | |
|   } /* if */
 | |
|   fclose(out);
 | |
|   fclose(in);
 | |
|   /* let the new file replace the old file */
 | |
|   if (buffersize == 0) {
 | |
|     if (argc == 2)
 | |
|       remove(TEMPFILE);
 | |
|     else
 | |
|       remove(argv[2]);
 | |
|   } else if (argc == 2) {
 | |
|     remove(argv[1]);
 | |
|     rename(TEMPFILE,argv[1]);
 | |
|   } /* if */
 | |
|   return 0;
 | |
| }
 |