/* compress.c -- Byte Pair Encoding compression */
/* Copyright 1996 Philip Gage */

/* This program appeared in the September 1997 issue of
 * C/C++ Users Journal. The original source code may still
 * be found at the web site of the magazine (www.cuj.com).
 *
 * It has been modified by me (Thiadmer Riemersma) to
 * compress only a section of the input file and to store
 * the compressed output along with the input as "C" strings.
 *
 * Compiling instructions:
 *  Borland C++ 16-bit (large memory model is required):
 *      bcc -ml scpack.c
 *
 *  Watcom C/C++ 32-bit:
 *      wcl386 scpack.c
 *
 *  GNU C (Linux), 32-bit:
 *      gcc scpack.c -o scpack
 */

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* MAXSIZE is the size (in bytes) of the buffer that collects the strings
 * to compress. The expression is parenthesized so that the macro can be
 * used safely inside a larger expression.
 */
#if UINT_MAX > 0xFFFFU
  #define MAXSIZE (1024*1024L)
#else
  #define MAXSIZE UINT_MAX      /* Input file buffer size */
#endif
#define HASHSIZE 8192   /* Hash table size, power of 2 */
#define THRESHOLD   3   /* Increase for speed, min 3 */

/* Markers that are recognized in the input file */
#define START_TOKEN "#ifdef SCPACK" /* start reading the buffer here */
#define NAME_TOKEN  "#define SCPACK_TABLE"
#define SEP_TOKEN   "#define SCPACK_SEPARATOR"
#define TERM_TOKEN  "#define SCPACK_TERMINATOR"
#define TEMPFILE    "~SCPACK.TMP"   /* output file when none is given */

/* Settings with their defaults; they are overwritten when the input file
 * contains the matching "#define SCPACK_..." line
 */
static char tablename[32+1] = "scpack_table"; /* name of the generated pair table */
static char separator[16]=",";                /* written between two strings */
static char terminator[16]="";                /* written after the last string */

int compress(unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
 | 
						|
{
 | 
						|
  unsigned char *left, *right, *count;
 | 
						|
  unsigned char a, b, bestcount;
 | 
						|
  unsigned i, j, index, bestindex, code=128;
 | 
						|
 | 
						|
  /* Dynamically allocate buffers and check for errors */
 | 
						|
  left = (unsigned char *)malloc(HASHSIZE);
 | 
						|
  right = (unsigned char *)malloc(HASHSIZE);
 | 
						|
  count = (unsigned char *)malloc(HASHSIZE);
 | 
						|
  if (left==NULL || right==NULL || count==NULL) {
 | 
						|
    printf("Error allocating memory\n");
 | 
						|
    exit(1);
 | 
						|
  }
 | 
						|
 | 
						|
  /* Check for errors */
 | 
						|
  for (i=0; i<buffersize; i++)
 | 
						|
    if (buffer[i] > 127) {
 | 
						|
      printf("This program works only on text files (7-bit ASCII)\n");
 | 
						|
      exit(1);
 | 
						|
    }
 | 
						|
 | 
						|
  memset(pairtable, 0, 128*2*sizeof(char));
 | 
						|
 | 
						|
  do {  /* Replace frequent pairs with bytes 128..255 */
 | 
						|
 | 
						|
    /* Enter counts of all byte pairs into hash table */
 | 
						|
    memset(count,0,HASHSIZE);
 | 
						|
    for (i=0; i<buffersize-1; i++) {
 | 
						|
      a = buffer[i];
 | 
						|
      b = buffer[i+1];
 | 
						|
      /* ignore any pair with a '\0' */
 | 
						|
      if (a == 0 || b == 0)
 | 
						|
        continue;
 | 
						|
      index = (a ^ (b << 6)) & (HASHSIZE-1);
 | 
						|
      while ((left[index] != a || right[index] != b) &&
 | 
						|
             count[index] != 0)
 | 
						|
        index = (index + 1) & (HASHSIZE-1);
 | 
						|
      left[index] = a;
 | 
						|
      right[index] = b;
 | 
						|
      if (count[index] < 255)
 | 
						|
        count[index] += (unsigned char)1;
 | 
						|
    }
 | 
						|
 | 
						|
    /* Search hash table for most frequent pair */
 | 
						|
    bestcount = THRESHOLD - 1;
 | 
						|
    for (i=0; i<HASHSIZE; i++) {
 | 
						|
      if (count[i] > bestcount) {
 | 
						|
        bestcount = count[i];
 | 
						|
        bestindex = i;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    /* Compress if enough occurrences of pair */
 | 
						|
    if (bestcount >= THRESHOLD) {
 | 
						|
 | 
						|
      /* Add pair to table using code as index */
 | 
						|
      a = pairtable[code-128][0] = left[bestindex];
 | 
						|
      b = pairtable[code-128][1] = right[bestindex];
 | 
						|
 | 
						|
      /* Replace all pair occurrences with unused byte */
 | 
						|
      for (i=0, j=0; i<buffersize; i++, j++)
 | 
						|
        if (a == buffer[i] && b == buffer[i+1]) {
 | 
						|
          buffer[j] = (unsigned char)code;
 | 
						|
          ++i;
 | 
						|
        }
 | 
						|
        else
 | 
						|
          buffer[j] = buffer[i];
 | 
						|
      buffersize = j;
 | 
						|
    }
 | 
						|
    else
 | 
						|
      break;
 | 
						|
  } while (++code < 255);
 | 
						|
 | 
						|
  /* done */
 | 
						|
  free(left); free(right); free(count);
 | 
						|
  return buffersize;  /* return adjusted buffersize */
 | 
						|
}
 | 
						|
 | 
						|
/* strmatch
 * Checks whether "str", after skipping its leading spaces and tabs,
 * starts with "token".
 *
 *   str     the line to inspect
 *   token   the text to look for
 *   indent  if not NULL and the token matches, receives the number of
 *           whitespace characters that were skipped; it is left
 *           untouched when there is no match
 *
 * Returns 1 if the token matches and 0 otherwise.
 */
static int strmatch(const char *str, const char *token, int *indent)
{
  int i = 0;

  /* skip whitespace */
  while (*str==' ' || *str=='\t') {
    str++;
    i++;
  } /* while */
  if (strncmp(str,token,strlen(token))!=0)
    return 0;
  if (indent != NULL)
    *indent = i;
  return 1;
}

/* check_if
 * Prints an error message and exits the program when the line starts
 * (after optional whitespace) with "#if". Since only the first three
 * characters are compared, "#ifdef" and "#ifndef" are rejected too.
 *
 *   str     the line to inspect
 *   linenr  the line number, for the error message
 */
static void check_if(char *str,int linenr)
{
  if (strmatch(str,"#if",NULL)) {
    printf("Error: \"#if...\" preprocessor statement should not be in SCPACK section "
           "(line %d)\n", linenr);
    exit(1);
  } /* if */
}

static int check_tablename(char *str)
 | 
						|
{
 | 
						|
  int i;
 | 
						|
 | 
						|
  if (strmatch(str,NAME_TOKEN,NULL)) {
 | 
						|
    str += strlen(NAME_TOKEN);
 | 
						|
    while (*str==' ' || *str=='\t')
 | 
						|
      str++;
 | 
						|
    for (i=0; i<(sizeof tablename - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
 | 
						|
      tablename[i] = *str;
 | 
						|
    tablename[i] = '\0';
 | 
						|
    return 1;
 | 
						|
  } /* if */
 | 
						|
  return 0;
 | 
						|
}
 | 
						|
 | 
						|
static int check_separator(char *str)
 | 
						|
{
 | 
						|
  int i;
 | 
						|
 | 
						|
  if (strmatch(str,SEP_TOKEN,NULL)) {
 | 
						|
    str += strlen(SEP_TOKEN);
 | 
						|
    while (*str==' ' || *str=='\t')
 | 
						|
      str++;
 | 
						|
    for (i=0; i<(sizeof separator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
 | 
						|
      separator[i] = *str;
 | 
						|
    separator[i] = '\0';
 | 
						|
    return 1;
 | 
						|
  } /* if */
 | 
						|
 | 
						|
  if (strmatch(str,TERM_TOKEN,NULL)) {
 | 
						|
    str += strlen(TERM_TOKEN);
 | 
						|
    while (*str==' ' || *str=='\t')
 | 
						|
      str++;
 | 
						|
    for (i=0; i<(sizeof terminator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
 | 
						|
      terminator[i] = *str;
 | 
						|
    terminator[i] = '\0';
 | 
						|
    return 1;
 | 
						|
  } /* if */
 | 
						|
 | 
						|
  return 0;
 | 
						|
}
 | 
						|
 | 
						|
/* readbuffer
 | 
						|
 * Reads in the input file and stores all strings in the
 | 
						|
 * section between "#ifdef SCPACK" and "#else" in a buffer.
 | 
						|
 * Only text that is between double quotes is added to the
 | 
						|
 * buffer; the \" escape code is handled. Multiple strings
 | 
						|
 * on one line are handled.
 | 
						|
 */
 | 
						|
unsigned readbuffer(FILE *input, unsigned char *buffer)
 | 
						|
{
 | 
						|
  char str[256];
 | 
						|
  unsigned buffersize;
 | 
						|
  int i,linenr;
 | 
						|
 | 
						|
  linenr=0;
 | 
						|
  buffersize=0;
 | 
						|
 | 
						|
  rewind(input);
 | 
						|
  while (!feof(input)) {
 | 
						|
    while (fgets(str,sizeof str,input)!=NULL) {
 | 
						|
      linenr++;
 | 
						|
      check_tablename(str);
 | 
						|
      check_separator(str);
 | 
						|
      if (strmatch(str,START_TOKEN,NULL))
 | 
						|
        break;
 | 
						|
    } /* while */
 | 
						|
    if (!strmatch(str,START_TOKEN,NULL))
 | 
						|
      return buffersize;  /* no (more) section found, quit */
 | 
						|
 | 
						|
    while (fgets(str,sizeof str,input)!=NULL) {
 | 
						|
      linenr++;
 | 
						|
      check_if(str,linenr);
 | 
						|
      if (check_tablename(str))
 | 
						|
        printf("Error: table name definition should not be in SCPACK section (line %d)\n", linenr);
 | 
						|
      check_separator(str);
 | 
						|
      if (strmatch(str,"#else",NULL))
 | 
						|
        break;          /* done */
 | 
						|
      /* add to the buffer only what is between double quotes */
 | 
						|
      i=0;
 | 
						|
      do {
 | 
						|
        while (str[i]!='\0' && str[i]!='"')
 | 
						|
          i++;
 | 
						|
        if (str[i]=='"') {
 | 
						|
          /* we are in a string */
 | 
						|
          i++;
 | 
						|
          while (str[i]!='\0' && str[i]!='"') {
 | 
						|
            /* handle escape sequences */
 | 
						|
            if (str[i]=='\\') {
 | 
						|
              i++;
 | 
						|
              switch (str[i]) {
 | 
						|
              case 'a': /* alarm */
 | 
						|
                buffer[buffersize++]='\a';
 | 
						|
                i++;
 | 
						|
                break;
 | 
						|
              case 'b': /* backspace */
 | 
						|
                buffer[buffersize++]='\b';
 | 
						|
                i++;
 | 
						|
                break;
 | 
						|
              case 'f': /* form feed */
 | 
						|
                buffer[buffersize++]='\f';
 | 
						|
                i++;
 | 
						|
                break;
 | 
						|
              case 'n': /* newline */
 | 
						|
                buffer[buffersize++]='\n';
 | 
						|
                i++;
 | 
						|
                break;
 | 
						|
              case 'r': /* carriage return */
 | 
						|
                buffer[buffersize++]='\n';
 | 
						|
                i++;
 | 
						|
                break;
 | 
						|
              case 't': /* tab */
 | 
						|
                buffer[buffersize++]='\t';
 | 
						|
                i++;
 | 
						|
                break;
 | 
						|
              case '\'':
 | 
						|
                buffer[buffersize++]='\'';
 | 
						|
                i++;
 | 
						|
                break;
 | 
						|
              case '"':
 | 
						|
                buffer[buffersize++]='"';
 | 
						|
                i++;
 | 
						|
                break;
 | 
						|
              default:
 | 
						|
                // ??? octal character code escapes and hexadecimal escapes
 | 
						|
                //     not supported
 | 
						|
                printf("Unknown escape sequence '\\%c' on line %d\n",
 | 
						|
                       str[i], linenr);
 | 
						|
              } /* switch */
 | 
						|
            } else {
 | 
						|
              buffer[buffersize++]=str[i++];
 | 
						|
            } /* if */
 | 
						|
          } /* while */
 | 
						|
          if (str[i]=='"') {
 | 
						|
            buffer[buffersize++]='\0'; /* terminate each string */
 | 
						|
            i++;
 | 
						|
          } else {
 | 
						|
            printf("Error: unterminated string on line %d\n",linenr);
 | 
						|
          } /* if */
 | 
						|
        } /* if */
 | 
						|
      } while (str[i]!='\0');
 | 
						|
    } /* while - in SCPACK section */
 | 
						|
    /* put in another '\0' to terminate the section */
 | 
						|
    buffer[buffersize++]='\0';
 | 
						|
  } /* while - !feof(input) */
 | 
						|
  return buffersize;
 | 
						|
}
 | 
						|
 | 
						|
/* write_pairtable
 * Writes the pair table as the definition of a C array, enclosed
 * between two marker comments (writefile() uses these markers to find
 * and strip an existing table). The pairs are written 16 per line.
 *
 *   output     the file to write to
 *   pairtable  the table; it is written up to (not including) the first
 *              entry with a zero byte, or completely if there is no such
 *              entry
 *   tablename  the name for the array
 *
 * Note: if the first entry already has a zero byte, the generated array
 * has an empty initializer.
 */
static void write_pairtable(FILE *output, unsigned char pairtable[128][2], char *tablename)
{
  int i, count;

  /* count the pairs that are in use */
  count = 0;
  while (count<128 && pairtable[count][0]!=0 && pairtable[count][1]!=0)
    count++;

  /* dump the pair table */
  fprintf(output, "/*-*SCPACK start of pair table, do not change or remove this line */\n");
  fprintf(output, "unsigned char %s[][2] = {", tablename);
  for (i=0; i<count; i++) {
    /* start a new line at every 16th pair */
    fputs(((i % 16)==0) ? "\n  " : " ", output);
    fprintf(output, "{%d,%d}", pairtable[i][0], pairtable[i][1]);
    /* a comma only if something follows this pair */
    if (i+1<count)
      fputc(',', output);
  } /* for */
  fprintf(output, "\n};\n");
  fprintf(output, "/*-*SCPACK end of pair table, do not change or remove this line */\n");
}

void writefile(FILE *input, FILE *output, unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
 | 
						|
{
 | 
						|
  char str[256];
 | 
						|
  int insection, indent, needseparator;
 | 
						|
  unsigned char *bufptr;
 | 
						|
 | 
						|
  bufptr = buffer;
 | 
						|
  insection = 0;
 | 
						|
 | 
						|
  rewind(input);
 | 
						|
  while (!feof(input)) {
 | 
						|
    while (fgets(str,sizeof str,input)!=NULL) {
 | 
						|
      fprintf(output,"%s",str);
 | 
						|
      if (check_tablename(str)) {
 | 
						|
        write_pairtable(output, pairtable, tablename);
 | 
						|
        /* strip an existing pair table from the file */
 | 
						|
        if (fgets(str,sizeof str,input)!=NULL) {
 | 
						|
          if (strmatch(str,"/*-*SCPACK",NULL)) {
 | 
						|
            while (fgets(str,sizeof str,input)!=NULL)
 | 
						|
              if (strmatch(str,"/*-*SCPACK",NULL))
 | 
						|
                break;
 | 
						|
          } else {
 | 
						|
            fprintf(output,"%s",str);
 | 
						|
          } /* if */
 | 
						|
        } /* if */
 | 
						|
      } /* if */
 | 
						|
      if (strmatch(str,START_TOKEN,NULL))
 | 
						|
        insection = 1;
 | 
						|
      if (insection && strmatch(str,"#else",NULL))
 | 
						|
        break;
 | 
						|
    } /* while */
 | 
						|
    if (!strmatch(str,"#else",&indent))
 | 
						|
      return;           /* no (more) section found, quit */
 | 
						|
    insection=0;
 | 
						|
 | 
						|
    /* dump the buffer as strings, separated with commas */
 | 
						|
    needseparator = 0;
 | 
						|
    while (*bufptr != '\0') {
 | 
						|
      assert((unsigned)(bufptr-buffer) < buffersize);
 | 
						|
      if (needseparator)
 | 
						|
        fprintf(output, "%s\n",separator);
 | 
						|
      fprintf(output, "%*c\"",indent+2,' ');
 | 
						|
      /* loop over string */
 | 
						|
      while (*bufptr != '\0') {
 | 
						|
        if (*bufptr<' ' || *bufptr >= 128 || *bufptr == '"' || *bufptr == '\\')
 | 
						|
          fprintf(output, "\\%03o", *bufptr);
 | 
						|
        else
 | 
						|
          fprintf(output, "%c", *bufptr);
 | 
						|
        bufptr++;
 | 
						|
      } /* while */
 | 
						|
      fprintf(output, "\"");
 | 
						|
      needseparator = 1;
 | 
						|
      bufptr++;           /* skip '\0' */
 | 
						|
    } /* while */
 | 
						|
    fprintf(output, "%s\n",terminator);
 | 
						|
    bufptr++;
 | 
						|
 | 
						|
    /* skip the input file until the #endif section */
 | 
						|
    while (fgets(str,sizeof str,input)!=NULL) {
 | 
						|
      if (strmatch(str,"#endif",NULL)) {
 | 
						|
        fprintf(output,"%s",str);
 | 
						|
        break;          /* done */
 | 
						|
      } /* if */
 | 
						|
    } /* while */
 | 
						|
  } /* while - !feof(input) */
 | 
						|
}
 | 
						|
 | 
						|
/* usage
 * Prints the command line syntax and exits the program with exit
 * code 1; this function does not return.
 */
static void usage(void)
{
  printf("Usage: scpack <filename> [output file]\n");
  exit(1);
}

int main(int argc, char **argv)
 | 
						|
{
 | 
						|
  FILE *in, *out;
 | 
						|
  unsigned char *buffer;
 | 
						|
  unsigned buffersize, orgbuffersize;
 | 
						|
  unsigned char pairtable[128][2];
 | 
						|
 | 
						|
  if (argc < 2 || argc > 3)
 | 
						|
    usage();
 | 
						|
  if ((in=fopen(argv[1],"rt"))==NULL) {
 | 
						|
    printf("SCPACK: error opening input %s\n",argv[1]);
 | 
						|
    usage();
 | 
						|
  } /* if */
 | 
						|
  if (argc == 2) {
 | 
						|
    if ((out=fopen(TEMPFILE,"wt"))==NULL) {
 | 
						|
      printf("SCPACK: error opening temporary file %s\n",TEMPFILE);
 | 
						|
      usage();
 | 
						|
    } /* if */
 | 
						|
  } else {
 | 
						|
    if ((out=fopen(argv[2],"wt"))==NULL) {
 | 
						|
      printf("SCPACK: error opening output file %s\n",argv[2]);
 | 
						|
      usage();
 | 
						|
    } /* if */
 | 
						|
  } /* if */
 | 
						|
 | 
						|
  buffer = (unsigned char *)malloc(MAXSIZE);
 | 
						|
  if (buffer == NULL) {
 | 
						|
    printf("SCPACK: error allocating memory\n");
 | 
						|
    return 1;
 | 
						|
  } /* if */
 | 
						|
  /* 1. read the buffer
 | 
						|
   * 2. compress the buffer
 | 
						|
   * 3. copy the file, insert the compressed buffer
 | 
						|
   */
 | 
						|
  buffersize = readbuffer(in, buffer);
 | 
						|
  orgbuffersize = buffersize;
 | 
						|
  if (buffersize > 0) {
 | 
						|
    buffersize = compress(buffer, buffersize, pairtable);
 | 
						|
    writefile(in, out, buffer, buffersize, pairtable);
 | 
						|
    printf("SCPACK: compression ratio: %ld%% (%d -> %d)\n",
 | 
						|
           100L-(100L*buffersize)/orgbuffersize, orgbuffersize, buffersize);
 | 
						|
  } else {
 | 
						|
    printf("SCPACK: no SCPACK section found, nothing to do\n");
 | 
						|
  } /* if */
 | 
						|
  fclose(out);
 | 
						|
  fclose(in);
 | 
						|
  /* let the new file replace the old file */
 | 
						|
  if (buffersize == 0) {
 | 
						|
    if (argc == 2)
 | 
						|
      remove(TEMPFILE);
 | 
						|
    else
 | 
						|
      remove(argv[2]);
 | 
						|
  } else if (argc == 2) {
 | 
						|
    remove(argv[1]);
 | 
						|
    rename(TEMPFILE,argv[1]);
 | 
						|
  } /* if */
 | 
						|
  return 0;
 | 
						|
}
 |