The Data Compression Book-:Statistical Modeling

Table of Contents
/************************* Start of ARITH-N.C **************************/
* * This is the order-n arithmetic coding module used in the final
* part of chapter 6.
*
* Compile with BITIO.C. ERRHAND.C, and either MAIN-C.C OR MAIN-E.C. This
* program should be compiled in large model.  An even better alternative
* is a DOS extender.
*
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "errhand.h"
#include "bitio.h"

/*
* The SYMBOL structure is what is used to defined a symbol in
* arithmetic coding terms.  A symbol is defined as a range between
* 0 and 1.  Since we are using integer math, instead of using 0 and 1
* as our end points, we have an integer scale.  The low_count and
* high_count define where the symbol falls in the range.
*/

typedef struct {
     unsigned short int low_count;
     unsigned short int high_count;
     unsigned short int scale;
} SYMBOL;

#define MAXIMUM_SCALE 16383  /* Maximum allowed frequency count */
#define ESCAPE   256         /* The escape symbol               */
#define DONE (-1)            /* The output stream empty  symbol */
#define FLUSH  (-2)          /* The symbol to flush the model   */

/*
*  Function prototypes.
*/

#ifdef __STDC__

void initialize_options( int argc, char **argv );
int check_compression( FILE *input, BIT_FILE *output );
void initialize_model( void );
void update_model( int symbol );
int convert_int_to_symbol( int symbol, SYMBOL *s );
void get_symbol_scale( SYMBOL *s );
int convert_symbol_to_int( int count, SYMBOL *s );
void add_character_to_model( int c );
void flush_model( void );
void initialize_arithmetic_decoder( BIT_FILE *stream ) ;
void remove_symbol_from_stream( BIT_FILE *stream, SYMBOL *s );
void initialize_arithmetic_encoder( void );
void encode_symbol( BIT_FILE *stream, SYMBOL *s );
void flush_arithmetic_encoder( BIT_FILE *stream );
short int get_current_count( SYMBOL *s );

#else

void initialize_options();
int check_compression();
void initialize_model();
void update_model();
int convert_int_to_symbol();
void get_symbol_scale();
int convert_symbol_to_int();
void add_character_to_model();
void flush_model();
void initialize_arithmetic_decoder();
void remove_symbol_from_stream();
void initialize_arithmetic_encoder();
void encode_symbol();
void flush_arithmetic_encoder();
short int get_current_count();

#endif

char *CompressionName = "Adaptive order n model with arithmetic coding";
char *Usage           = "in-file out-file [ -o order ]\n\n";
int max_order         = 3;

/*
* The main procedure is similar to the main found in ARITH1E.C.  It has
* to initialize the coder and the model.  It then sits in a loop reading
* input symbols and encoding them.  One difference is that every 256
* symbols a compression check is performed.  If the compression ratio
* falls below 10%, a flush character is encoded.  This flushes the encod
* ing model, and will cause the decoder to flush its model when the
* file is being expanded.  The second difference is that each symbol is
* repeatedly encoded until a successful encoding occurs.  When trying to
* encode a character in a particular order, the model may have to
* transmit an ESCAPE character.  If this is the case, the character has
* to be retransmitted using a lower order.  This process repeats until a
* successful match is found of the symbol in a particular context.
* Usually this means going down no further than the order -1 model.
* However, the FLUSH and DONE symbols drop back to the order -2 model.
*
*/

void CompressFile( input, output, argc, argv )
FILE *input;
BIT_FILE *output;

int argc;
char *argv[];
{
     SYMBOL s;
     int c;
     int escaped;
     int flush = 0;
     long int text_count = 0;
     initialize_options( argc, argv );
     initialize_model();
     initialize-arithmetic_encoder();
     for ( ; ; ) {
       if ( ( ++text_count & 0x0ff ) == 0 )
         flush = check_compression( input, output );
       if ( !flush )
         c = getc( input );
       else
         c = FLUSH;
       if ( c == EOF )
         c = DONE;
       do {
         escaped = convert_int_to_symbol( c, &s);
         encode_symbol( output, &s );
       } while ( escaped );
       if ( c == DONE )
         break;
       if ( c == FLUSH ) {
         flush_model();
         flush = 0;
       }
       update_model( c );
       add_character_to_model( c );
     }
     flush_arithmetic_encoder( output );
}
/*
* The main loop for expansion is very similar to the expansion
* routine used in the simpler compression program, ARITH1E.C.  The
* routine first has to initialize the the arithmetic coder and the
* model.  The decompression loop differs in a couple of respect.
* First of all, it handles the special ESCAPE character, by
* removing them from the input bit stream but just throwing them
* away otherwise.  Secondly, it handles the special FLUSH character.
* Once the main decoding loop is done, the cleanup code is called,
* and the program exits.
*
*/
void ExpandFile( input, output, argc, argv )
BIT_FILE *input;
FILE *output;
int argc;
char *argv[];
{
     SYMBOL s;
     int c;
     int count;

     initialize_options( argc, argv );
     initialize_model();
     initialize_arithmetic_decoder( input );
     for ( ; ; ) {
       do {
         get_symbol_scale( &s );
         count = get_current_count( &s );
         c = convert_symbol_to_int( count, &s );
         remove_symbol_from_stream( input, &s );
       } while ( c == ESCAPE );
       if ( c == DONE )
         break;
       if ( c != FLUSH )
         putc( (char) c, output );
       else
         flush_model();
       update_model( c );
       add_character_to_model( c );
     }
}

/*
* This routine checks for command line options.  At present, the only
* option being passed on the command line is the order.
*/

void initialize_options( argc, argv )
int argc;
char *argv[];
{
     while ( argc–– > 0 ) {
       if ( strcmp( *argv, "-o" ) == 0 ) {
         argc––;
         max_order = atoi( *++argv );
       } else
         printf( "Uknown argument on command line: %s\n", *argv );
       argc––;
       argv++;
     }
}

/*
* This routine is called once every 256 input symbols.  Its job is to
* check to see if the compression ratio falls below 10%.  If the
* output size is 90% of the input size, it means not much compression
* is taking place, so we probably ought to flush the statistics in the
* model to allow for more current statistics to have greater impact.
* This heuristic approach does seem to have some effect.
*/

int check_compression( input, output )
FILE *input;
BIT_FILE *output;
{
     static long local_input_marker = 0L;
     static long local_output_marker = 0L;
     long total_input_bytes;
     long total_output_bytes;
     int local_ratio;

     total_input_bytes = ftell( input ) - local_input_marker;
     total_output_bytes = ftell( output->file );
     total_output_bytes -= local_output_marker;
     if ( total_output_bytes == 0 )
       total_output_bytes = 1;
     local_ratio = (int)( ( total_output_bytes * 100 ) /
                                   total_input_bytes );
     local_input_marker = ftell( input );
     local_output_marker = ftell( output->file );
     return( local_ratio > 90 );
}

/*
*
* The next few routines contain all of the code and data used to
* perform modeling for this program.  This modeling unit keeps track
* of all contexts from 0 up to max_order, which defaults to 3.  In
* addition, there is a special context -1 which is a fixed model used
* to encode previously unseen characters, and a context -2 which is
* used to encode EOF and FLUSH messages.
*
* Each context is stored in a special CONTEXT structure, which is
* documented below.  Context tables are not created until the context
* is seen, and they are never destroyed.
*
*/

/*
* A context table contains a list of the counts for all symbols
* that have been seen in the defined context.  For example, a
* context of "Zor" might have only had 2 different characters
* appear.  't' might have appeared 10 times, and '1' might have
* appeared once.  These two counts are stored in the context
* table.  The counts are stored in the STATS structure.  All of
* the counts for a given context are stored in and array of STATS.
* As new characters are added to a particular contexts, the STATS
* array will grow.  Sometimes the STATS array will shrink
* after flushing the model.
*/
typedef struct {
     unsigned char symbol;
     unsigned char counts;
} STATS;

/*
* Each context has to have links to higher order contexts.  These
* links are used to navigate through the context tables.  For example,
* to find the context table for "ABC", I start at the order 0 table,
* then find the pointer to the "A" context table by looking through
* the LINKS array.  At that table, we find the "B" link and go to
* that table.  The process continues until the destination table is
* found.  The table pointed to by the LINKS array corresponds to the
* symbol found at the same offset in the STATS table.  The reason that
* LINKS is in a separate structure instead of being combined with
* STATS is to save space.  All of the leaf context nodes don't need
* next pointers, since they are in the highest order context.  In the
* leaf nodes, the LINKS array is a NULL pointer.
*/
typedef struct {
     struct context *next;
} LINKS;

/*
* The CONTEXT structure holds all of the known information about
* a particular context.  The links and stats pointers are discussed
* immediately above here.  The max_index element gives the maximum
* index that can be applied to the stats or link array.  When the
* table is first created, and stats is set to NULL, max_index is set
* to -1.  As soon as single element is added to stats, max_index is
* incremented to 0.
*
* The lesser context pointer is a navigational aid.  It points to
* the context that is one less than the current order.  For example,
* if the current context is "ABC", the lesser_context pointer will
* point to "BC".  The reason for maintaining this pointer is that
* this particular bit of table searching is done frequently, but
* the pointer only needs to be built once, when the context is
* created.
*/
typedef struct context {
     int max_index;
     LINKS *links;
     STATS *stats;
     struct context *lesser_context;
} CONTEXT;
/*
* *contexts[] is an array of current contexts.  If I want to find
* the order 0 context for the current state of the model.  I just
* look at contexts[0].  This array of context pointers is set up
* every time the model is updated.
*/
CONTEXT **contexts;

/*
* current_order contains the current order of the model.  It starts
* at max_order, and is decremented every time an ESCAPE is sent.  It
* will only go down to -1 for normal symbols, but can go to -2 for
* EOF and FLUSH.
*/
int current_order;

/*
* This table contains the cumulative totals for the current context.
* Because this program is using exclusion, totals has to be calculated
* every time a context is used.  The scoreboard array keeps track of
* symbols that have appeared in higher order models, so that they
* can be excluded from lower order context total calculations.
*/

short int totals[ 258 ];
char scoreboard[ 256 ];

/*
* Local procedure declarations for modeling routines.
*/
#ifdef __STDC__
void update_table( CONTEXT *table, int symbol );
void rescale_table( CONTEXT *table );
void totalize_table( CONTEXT *table );
CONTEXT *shift_to_next_context( CONTEXT *table, int c, int order );
CONTEXT *allocate_next_order_table( CONTEXT *table,
                                    int symbol,
                                    CONTEXT *lesser_context );
void recursive_flush( CONTEXT *table );
#else
void update_table();
void rescale_table();
void totalize_table();
CONTEXT *shift_to_next_context();
CONTEXT *allocate_next_order_table();
void recursive_flush();
#endif

/*
* This routine has to get everything set up properly so that
* the model can be maintained properly.  The first step is to create
* the *contexts[] array used later to find current context tables.
* The *contexts[] array indices go from -2 up to max_order, so
* the table needs to be fiddled with a little.  This routine then
* has to create the special order -2 and order -1 tables by hand,
* since they aren't quite like other tables.  Then the current
* context is set to \0, \0, \0, ... and the appropriate tables
* are built to support that context.  The current order is set
* to max_order, the scoreboard is cleared, and the system is
* ready to go.
*/

void initialize_model()
{
     int i;
     CONTEXT *null_table;
     CONTEXT *control_table;

     current_order = max_order;
     contexts = (CONTEXT **) calloc( sizeof( CONTEXT * ), 10 );
     if ( contexts == NULL )
       fatal_error( "Failure #1:  allocating context table!" );
     context += 2;
     null_table = (CONTEXT *) calloc( sizeof( CONTEXT ), 1 );
     if ( null_table == NULL )
       fatal_error( "Failure #2:  allocating null table!" );
     null_table->max_index = -1;
     contexts[ -1 ] = null_table;
     for ( i = 0 ; i <= max_order ; 1++ )
        contexts[ i ] = allocate_next_order_table( contexts[ i-1 ],
                                                   0,
                                                   contexts[ i-1 ] );
     free( (char *) null_table->stats );
     null_table->stats =
       (STATS &) calloc( sizeof( STATS ), 256 );
     if ( null_table->stats == NULL )
       fatal_error( "Failure #3:  allocating null table!" );
     null_table->max_index = 255;
     for ( i=0 ; i < 256 ; i++ ) {
       null_table->stats[ i ].symbol = (unsigned char) i;
       null_table->stats[ i ].counts = 1;
     }

     control_table = (CONTEXT *) calloc( sizeof(CONTEXT), 1 );
     if ( control_table == NULL )
       fatal_error( "Failure #4:  allocating null table!" );
     control_table->stats =
       (STATS *) calloc( sizeof( STATS ), 2 );
     if ( control_table->stats == NULL )
       fatal_error( "Failure #5:  allocating null table!" );
     contexts[ -2 ] = control_table;
     control_table->max_index = 1;
     control_table->stats[ 0 ].symbol = -FLUSH;
     control_table->stats[ 0 ].counts = 1;
     control_table->stats[ 1 ].symbol =– DONE;
     control_table->stats[ 1 ].counts = 1;

     for ( i = 0 ; i < 256 ; i++ )
       scoreboard[ i ] = 0;
}

/*
* This is a utility routine used to create new tables when a new
* context is created.  It gets a pointer to the current context,
* and gets the symbol that needs to be added to it.  It also needs
* a pointer to the lesser context for the table that is to be
* created.  For example, if the current context was "ABC", and the
* symbol 'D' was read in, add_character_to_model would need to
* create the new context "BCD".  This routine would get called
* with a pointer to "BC", the symbol 'D', and a pointer to context
* "CD".  This routine then creates a new table for "BCD", adds it
* to the link table for "BC", and gives "BCD" a back pointer to
* "CD".  Note that finding the lesser context is a difficult
* task, and isn't done here.  This routine mainly worries about
* modifying the stats and links fields in the current context.
*/

CONTEXT *allocate_next_order_table( table, symbol, lesser_context )
CONTEXT *table;
int symbol;
CONTEXT *lesser_context;
{
     CONTEXT *new_table;
     int i;
     unsigned int new_size;

     for ( i = 0 ; i <= table->max_index ; i++ )
       if (table->stats[ i ].symbol == (unsigned char) symbol )
         break;
       if ( i > table->max_index ) {
         table->max_index++;
         new_size = sizeof( LINKS );
         new_size *= table->max_index + 1;
         if ( table->links == NULL )
           table->links = (LINKS *) calloc( new_size, 1 );
         else
           table->links = (LINKS *)
             realloc( (char *) table->links, new_size );
         new_size = sizeof( STATS );
         new_size *= table->max_index + 1;
         if ( table->stats == NULL )
           table->stats = (STATS *) calloc( new_size, 1 );
         else
           table->stats = (STATS *)
             realloc( (char *) table->stats, new_size );
         if ( table->links == NULL )
           fatal_error( "Failure #6:  allocating new table" );
         if ( table->stats == NULL )
           fatal_error( "Failure #7:  allocating new table" );
         table->stats[ i ].symbol = (unsigned char) symbol;
         table->stats[ i ].counts = 0;
     }
     new_table = (CONTEXT *) calloc(sizeof( CONTEXT ), 1 );
     if ( new_table == NULL )
       fatal_error( "Failure #8:  allocating new table" );
     new_table->max_index = -1;
     table->links[ i ].next = new_table;
     new_table->lesser_context = lesser_context;
     return( new_table );
}

/*
* This routine is called to increment the counts for the current
* contexts.  It is called after a character has been encoded or
* decoded.  All it does is call update_table for each of the
* current contexts, which does the work of incrementing the count.
* This particular version of update_model() practices update exclusion,
* which means that if lower order models weren't used to encode
* or decode the character, they don't get their counts updated.
* This seems to improve compression performance quite a bit.
* To disable update exclusion, the loop would be changed to run
* from 0 to max_order, instead of current_order to max_order.
*/
void update_model( symbol )
int symbol;
{

     int i;
     int local_order;

     if ( current_order < 0 )
       local_order = 0;
     else
       local_order = current_order;
     if ( symbol >= 0 ) {
       while ( local_order <= max_order ) {
         if ( symbol >= 0 )
           update_table( contexts[ local_order ], symbol );
         local_order++;
        }
     }
     current_order = max_order;
     for ( i = 0 ; i < 256 ; i++ )
       scoreboard[ i ] = 0;
}

/*
* This routine is called to update the count for a particular symbol
* in a particular table.  The table is one of the current contexts,
* and the symbol is the last symbol encoded or decoded.  In principle
* this is a fairly simple routine, but a couple of complications make
* things a little messier.  First of all, the given table may not
* already have the symbol defined in its statistics table.  If it
* doesn't, the stats table has to grow and have the new guy added
* to it.  Secondly, the symbols are kept in sorted order by count
* in the table so that the table can be trimmed during the flush
* operation.  When this symbol is incremented, it might have to be moved
* up to reflect its new rank.  Finally, since the counters are only
* bytes, if the count reaches 255, the table absolutely must be rescaled
* to get the counts back down to a reasonable level.
*/

void update_table( table, symbol )
CONTEXT *table;
int symbol;
{
     int i;
     int index;
     unsigned char temp;
     CONTEXT *temp_ptr;
     unsigned int new_size;
/*
* First, find the symbol in the appropriate context table.  The first
* symbol in the table is the most active, so start there.
*/

     index = 0;
     while ( index <= table->max_index &&
          table->stats[index].symbol != (unsigned char) symbol )
       index++;
     if ( index > table->max_index ) {
       table->max_index++;
       new_size = sizeof( LINKS );
       new_size *= table->max_index + 1;
       if ( current_order < max_order ) {
         if ( table->max_index == 0 )
           table->links - (LINKS *) calloc( new_size, 1 );
         else
           table->links = (LINKS *)
             realloc( (char *) table->links, new_size );
         if (  table->links == NULL )
           fatal_error( "Error #9:  reallocating table space!" );
         table->links[ index ].next = NULL;
     }
     new_size = sizeof( STATS );
     new_size *= table->max_index + 1;
     if (table->max_index==0)
       table->stats = (STATS *) calloc( new_size, 1 );
     else
       table->stats = (STATS *)
         realloc( (char *) table->stats, new_size );
     if ( table->stats == NULL )
       fatal_error( "Error #10:  reallocating table space!" );
     table->stats[ index ].symbol = (unsigned char) symbol;
     table->stats[ index ].counts = 0;
   }
/*
* Now I move the symbol to the front of its list.
*/

     i = index;
     while ( i > 0 &&
       table->stats[ index ]. counts == table->stats[ i-1 ].counts )
       i--;
     if ( i != index ) {
       temp = table->stats[ index ].symbol;
       table->stats[ index ].symbol = table->stats[ i ].symbol;
       table->stats[ i ].symbol = temp;
       if ( table->links != NULL ) {
         temp_ptr = table->links[ index ].next;
         table->links[ index ].next = table->links[ i ].next;
         table->links[ i ].next = temp_ptr;
       }
       index = 1;
  }
/*
* The switch has been performed, now I can update the counts
*/
  table->stats[ index ].counts++;
  if ( table->stats[ index ].counts == 255 )
    rescale_table( table );
}
/*
* This routine is called when a given symbol needs to be encoded.
* It is the job of this routine to find the symbol in the context
* table associated with the current table, and return the low and
* high counts associated with that symbol, as well as the scale.
* Finding the table is simple.  Unfortunately, once I find the table,
* I have to build the table of cumulative counts, which is
* expensive, and is done elsewhere.  If the symbol is found in the
* table, the appropriate counts are returned.  If the symbol is
* not found, the ESCAPE symbol probabilities are returned, and
* the current order is reduced.  Note also the kludge to support
* the order -2 character set, which consists of negative numbers
* instead of unsigned chars.  This insures that no match will ever
* be found for the EOF or FLUSH symbols in  any "normal" table.
*/
int convert_int_to_symbol( c, s )
int c;
SYMBOL *s;
{
     int i;
     CONTEXT *table;

     table = contexts[ current_order ];
     totalize_table( table );
     s->scale = totals[ 0 ];
     if ( current_order == –2 )
       c = -c;
     for ( i = 0 ; i <= table->max_index ; i++ ) {
       if ( c == (int) table->stats[ i ].symbol ) {
         if ( table->stats[ i ].counts == 0 )
           break;
         s->low_count = totals[ i+2 ];
         s->high_count = totals[ i+1 ];
         return( 0 );
       }
     }
     s->low_count = totals[ 1 ];
     s->high-count = totals[ 0 ];
     current_order––;
     return( 1 );
}
/*
* This routine is called when decoding an arithmetic number.  In
* order to decode the present symbol, the current scale in the
* model must be determined.  This requires looking up the current
* table, then building the totals table.  Once that is done, the
* cumulative total table has the symbol scale at element 0.
*/

void get_symbol_scale( s)
SYMBOL *s;
{
     CONTEXT *table;

     table = contexts[ current_order ];
     totalize_table( table );
     s->scale = totals[ 0 ];
}
/*
* This routine is called during decoding.  It is given a count that
* came out of the arithmetic decoder, and has to find the symbol that
* matches the count.  The cumulative totals are already stored in the
* totals[] table, from the call to get_symbol-scale, so this routine
* just has to look through that table.  Once the match is found,
* the appropriate character is returned to the caller.  Two possible
* complications.  First, the character might be the ESCAPE character,
* in which case the current_order has to be decremented.  The other
* complication.  First, the character might be the ESCAPE character,
* in which case the current_order has to be decremented.  The other
* complication is that the order might be -2, in which case we return
* the negative of the symbol so it isn't confused with a normal
* symbol.
*/
int convert_symbol_to_int( count, s )
int count;
SYMBOL *s;
{
     int c;
     CONTEXT *table;

     table - contexts[ current_order ];
     for ( c = 0; count < totals[ c ] ; c++ )
       ;
     s->high_count = totals[ c - 1 ];
     s->low_count = totals[ c ]:
     if ( c == 1 ) {
       current_order––;
       return( ESCAPE );
     }
     if ( current_order < -1 )
       return( (int) -table->stats[ c-2 ].symbol );
     else
       return( table->stats[ c-2 ].symbol );
}

/*
* After the model has been updated for a new character, this routine
* is called to "shift" into the new context.  For example, if the
* last context was "ABC", and the symbol 'D' had just been processed,
* this routine would want to update the context pointers to that
* context[1]=="D", contexts[2]=="CD" and contexts[3]=="BCD".  The
* potential problem is that some of these tables may not exist.
* The way this is handled is by the shift_to_next_context routine.
* It is passed a pointer to the "ABC" context, along with the symbol
* 'D', and its job is to return a pointer to "BCD".  Once we have
* "BCD", we can follow the lesser context pointers in order to get
* the pointers to "CD" and "C".  The hard work was done in
* shift_to_context().
*/

void add_character_to_model( c )
int c;
{

     int i;
     if ( max_order < 0 || c < 0 )
       return;
     contexts[ max_order ] =
       shift_to_next_context( contexts[ max_order ], c, max_order );
     for ( i = max_order-1 ; i > 0 ; i–– )
       contexts[ i ] = contexts[ i+1 ]->lesser_context;
}

/*
* This routine is called when adding a new character to the model.  From
* the previous example, if the current context was "ABC", and the new
* symbol was 'D', this routine would get called with a pointer to
* context table "ABC", and symbol 'D', with order max_order.  What this
* routine needs to do then is to find the context table "BCD".  This
* should be an easy job, and it is if the table already exists.  All
* we have to in that case is follow the back pointer from "ABC" to "BC".
* We then search the link table of "BC" until we find the link to "D".
* That link points to "BCD", and that value is then returned to the
* caller.  The problem crops up when "BC" doesn't have a pointer to
* "BCD".  This generally means that the "BCD" context has not appeared
* yet.  When this happens, it means a new table has to be created and
* added to the "BC" table.  That can be done with a single call to
* the allocate_new_table routine.  The only problem is that the
* allocate_new_table routine wants to know what the lesser context for
* the new table is going to be.  In other words, when I create "BCD",
* I need to know where "CD" is located.  In order to find "CD", I
* have to recursively call shift_to_next_context, passing it a pointer
* to context "C" and the symbol 'D'.  It then returns a pointer to
* "CD", which I use to create the "BCD" table.  The recursion is
* guaranteed to end if it ever gets to order -1, because the null table
* is guaranteed to have a link for every symbol to the order 0 table.
* This is the most complicated part of the modeling program, but it is
* necessary for performance reasons.
*/
CONTEXT *shift_to_next_context( table, c, order )
CONTEXT *table;
int c;
int order;
{

     int i;
     CONTEXT *new_lesser;
/*
* First, try to find the new context by backing up to the lesser
* context and searching its link table.  If I find the link, we take
* a quick and easy exit, returning the link.  Note that there is a
* special kludge for context order 0.  We know for a fact that
* the lesser context pointer at order 0 points to the null table,
* order -1, and we know that the -1 table only has a single link
* pointer, which points back to the order 0 table.
*/
     table = table->lesser_context;
     if ( order == 0 )
       return( table->links[ 0 ].next );
     for ( i = 0 ; i <= table->max_index ; i++ )
       if ( table->stats[ i ].symbol == (unsigned char) c )
         if ( table->links[ i ].next != NULL)
           return( table->links[ i ].next );
         else
           break;
/*
* If I get here, it means the new context did not exist.  I have to
* create the new context, add a link to it here, and add the backwards
* link to *his* previous context.  Creating the table and adding it to
* this table is pretty easy, but adding the back pointer isn't.  Since
* creating the new back pointer isn't easy, I duck my responsibility
* and recurse to myself in order to pick it up.
*/
  new_lesser = shift_to_next_context( table, c, order-1 );
/*
* Now that I have the back pointer for this table, I can make a call
* to a utility to allocate the new table.
*/
  table = allocate_next_order_table( table, c, new_lesser );
  return( table );
}

/*
* Rescaling the table needs to be done for one of three reasons.
* First, if the maximum count for the table has exceeded 16383, it
* means that arithmetic coding using 16 and 32 bit registers might
* no longer work.  Secondly, if an individual symbol count has
* reached 255, it will no longer fit in a byte.  Third, if the
* current model isn't compressing well, the compressor program may
* want to rescale all tables in order to give more weight to newer
* statistics.  All this routine does is divide each count by 2.
* If any counts drop to 0, the counters can be removed from the
* stats table, but only if this is a leaf context.  Otherwise, we
* might cut a link to a higher order table.
*/
void rescale_table( table )
CONTEXT *table;
{
     int i;

     if ( table->max_index == -1 )
       return;
     for ( i = 0 ; i <= table->max_index ; i ++ )
       table->stats[ i ].counts /= 2;
     if ( table->stats[ table]>max_index ].counts == 0 &&
         table->links == NULL ) {
         while ( table->stats[ table->max_index ].counts == 0 &&
              table->max_index >= 0 )
           table->max_index––;
         if ( table->max_index == -1 ) {
           free( (char *) table->stats );
           table->stats = NULL;
         } else {
           table->stats = (STATS *)
             realloc( (char *) table->stats,
                     sizeof( STATS ) * ( table->max_index + 1 ) );
           if ( table->stats == NULL )
             fatal_error( "Error #11: reallocating stats space!" );
     }
   }
}
/*
* This routine has the job of creating a cumulative totals table for
* a given context.  The cumulative low and high for symbol c are going to
* be stored in totals[c+2] and totals[c+1].  Locations 0 and 1 are
* reserved for the special ESCAPE symbol.  The ESCAPE symbol
* count is calculated dynamically, and changes based on what the
* current context looks like.  Note also that this routine ignores
* any counts for symbols that have already shown up in the scoreboard,
* and it adds all new symbols found here to the scoreboard.  This
* allows us to exclude counts of symbols that have already appeared in
*  higher order contexts, improving compression quite a bit.
*/

void totalize_table( table )
CONTEXT *table;
{

     int i;
     unsigned char max;

     for ( ; ; ) {
       max = 0;
       i = table->max_index + 2;
       totals[ i ] = 0;
       for ( ; i > 1 ; i- ) {
         totals[ i-1 ] = totals[ i ];
         if ( table->stats[ i-2 ].counts )
           if ( ( current_order == -2 ) ||
             scoreboard[ table->stats[ i-2 ].symbol ] == 0 )
             totals[ i-1 ] += table->stats[ i-2].counts;
         if ( table->stats[ i-2 ].counts > max )
           max = table->stats[ i-2 ].counts;
       }
/*
* Here is where the escape calculation needs to take place.
*/

     if ( max == 0 )
       totals[ 0 ] = 1;
     else {
       totals[ 0 ] = (short int) ( 256 - table->max_index );
       totals[ 0 ] *= table->max_index;
       totals[ 0 ] /= 256;
       totals[ 0 ] /= max;
       totals[ 0 ]++;
       totals[ 0 ] += totals[ 1 ];
     }
     if ( totals[ 0 ] < MAXIMUM_SCALE )
       break;
     rescale_table( table );
     }
     for ( i = 0 ; i < table->max_index ; i++ )
       if (table->stats[i].counts != 0)
         scoreboard[ table->stats[ i ].symbol ] = 1;
}

/*
* This routine is called when the entire model is to be flushed.
* This is done in an attempt to improve the compression ratio by
* giving greater weight to upcoming statistics.  This routine
* starts at the given table, and recursively calls itself to
* rescale every table in its list of links.  The table itself
* is then rescaled.
*/

void recursive_flush( table )
CONTEXT *table;
{
     int i;
     if ( table->links != NULL )
       for ( i = 0 ; i <= table->max_index ; i++ )
         if ( table->links[ i ].next != NULL )
           recursive_flush( table->links[ i ].next );
     rescale_table( table );
}

/*
* This routine is called to flush the whole table, which it does
* by calling the recursive flush routine starting at the order 0
* table.
*/

void flush_model()
{
  putc( 'F', stdout );
  recursive_flush( contexts[ 0 ] );
}

/*
* Everything from here down define the arithmetic coder section
* of the program.
*/

*/
* These four variables define the current state of the arithmetic
* coder/decoder.  They are assumed to be 16 bits long.  Note that
* by declaring them as short ints, they will actually be 16 bits
* on most 80X86 and 680X0 machines, as well as VAXen.
*/
static unsigned short int code;/* The present input code value      */
static unsigned short int low; /* Start of the current code range   */
static unsigned short int high;/* End of the current code range     */
long underflow_bits;           /* Number of underflow bits pending  
*/
/*
* This routine must be called to initialize the encoding process.
* The high register is initialized to all 1s, and it is assumed that
* it has an infinite string of 1s to be shifted into the lower bit
* positions when needed.
*/

void initialize_arithmetic_encoder()
{
     low = 0;
     high = 0xffff;
     underflow_bits = 0;
}

/*
* At the end of the encoding process, there are still significant
* bits left in the high and low registers.  We output two bits,
* plus as many underflow bits as are necessary.
*/
void flush_arithmetic_encoder( stream )
BIT_FILE *stream;
{
      OutputBit( stream, low & 0x4000 );
      underflow_bits++;
      while ( underflow_bits-- > 0 )
        OutputBit( stream, ~low & 0X4000 );
      OutputBits( stream, 0L, 16 );
}

/*
* This routine is called to encode a symbol.  The symbol is passed
* in the SYMBOL structure as a low count, a high count, and a range,
* instead of the more conventional probability ranges.  The encoding
* process takes two steps.  First, the values of high and low are
* updated to take into account the range restriction created by the
* new symbol.  Then, as many bits as possible are shifted out to
* the output stream.  Finally, high and low are stable again and
* the routine returns.
*/

void encode_symbol( stream, s )
BIT_FILE *stream;
SYMBOL *s;
{
  long range;

/*
*  These three lines rescale high and low for the new symbol.
*/

     range = (long) ( high-low ) + 1;
     high = low + (unsigned short int)
                  (( range * s->high_count ) / s->scale -1 );
     low = low + (unsigned short int)
                  (( range * s->low_count ) / s->scale );

/*
* This loop turns out new bits until high and low are far enough
* apart to have stabilized.
*/

  for ( ; ; ) {

/*
* If this test passes, it means that the MSDigits match, and can
* be sent to the output stream.
*/

     if ( ( high & 0x8000 ) == ( low & 0x8000 ) ) {
       OutputBit( stream, high & 0x8000 );
       while ( underflow_bits > 0 ) {
         OutputBit( stream, ~high & 0x8000 );
         underflow_bits--;
       }
     }

/*
* If this test passes, the numbers are in danger of underflow, because
* the MSDigits don't match, and the 2nd digits are just one apart.
*/

     else if ( ( low & 0x4000 ) && !( high & 0x4000 )) {
       underflow_bits += 1;
       low &= 0x3fff;
       high |= 0x4000;
     } else
       return ;
     low <<= 1;
     high <<= 1;
     high  |= 1;

   }
}
/*
* When decoding, this routine is called to figure out which symbol
* is presently waiting to be decoded.  This routine expects to get
* the current model scale in the s->scale parameter, and it returns
* a count that corresponds to the present floating point code;
*
* code = count / s->scale
*/
short int get_current_count( s )
SYMBOL *s;
{
     long range;
     short int count;

     range = (long) ( high - low ) + 1;
     count = (short int)
          ((((long) ( code - low ) + 1 ) * s->scale-1 ) / range );
     return( count );
}
/*
* This routine is called to initialize the state of the arithmetic
* decoder.  This involves initializing the high and low registers
* to their conventional starting values, plus reading the first
* 16 bits from the input stream into the code value.
*/
void initialize_arithmetic_decoder( stream )
BIT_FILE *stream;
{
     int i;

     code = 0;
     for ( i = 0 ; i < 16 ; i++ ) {
        code <<= 1;
        code += InputBit( stream );
     }
     low = 0;
     high = 0xffff;
}

/*
* Just figuring out what the present symbol is doesn't remove
* it from the input bit stream.  After the character has been
* decoded, this routine has to be called to remove it from the
* input stream.
*/
void remove_symbol_from_stream( stream, s )
BIT_FILE *stream;
SYMBOL *s;
{
  long range;

/*
* First, the range is expanded to account for the symbol removal.
*/

     range = (long)( high - low ) + 1;
     high = low + (unsigned short int)
                  (( range * s->high_count ) / s->scale -1 );
     low = low + (unsigned short int)
                  (( range * s->low_count ) / s->scale );

/*
* Next, any possible bits are shipped out.
*/

     for ( ; ; ) {

/*
* If the MSDigits match, the bits will be shifted out.
*/

     if ( ( high & 0x8000 ) == ( low & 0x8000 ) ) {
     }

/*
* Else, if underflow is threatening, shift out the 2nd MSDigit.
*/

     else if ((low & 0x4000) == 0x4000  && (high & 0x4000) == 0 ) {
       code ^= 0x4000;
       low &= 0x3fff;
       high |= 0x4000;
     } else

/*
* Otherwise, nothing can be shifted out, so I return.
*/

        return;
     low <<= 1;
     high <<= 1;
     high |= 1;
     code <<= 1;
     code += InputBit( stream );
   }
}
/************************** End of ARITH-N.C ***************************/
Table of Contents
ARITH-N Listing