| /* gznorm.c -- normalize a gzip stream | 
 |  * Copyright (C) 2018 Mark Adler | 
 |  * For conditions of distribution and use, see copyright notice in zlib.h | 
 |  * Version 1.0  7 Oct 2018  Mark Adler */ | 
 |  | 
 | // gznorm takes a gzip stream, potentially containing multiple members, and | 
 | // converts it to a gzip stream with a single member. In addition the gzip | 
 | // header is normalized, removing the file name and time stamp, and setting the | 
 | // other header contents (XFL, OS) to fixed values. gznorm does not recompress | 
 | // the data, so it is fast, but no advantage is gained from the history that | 
 | // could be available across member boundaries. | 
 |  | 
 | #include <stdio.h>      // fread, fwrite, putc, fflush, ferror, fprintf, | 
 |                         // vsnprintf, stdout, stderr, NULL, FILE | 
 | #include <stdlib.h>     // malloc, free | 
 | #include <string.h>     // strerror | 
 | #include <errno.h>      // errno | 
 | #include <stdarg.h>     // va_list, va_start, va_end | 
 | #include "zlib.h"       // inflateInit2, inflate, inflateReset, inflateEnd, | 
 |                         // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK, | 
 |                         // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR, | 
 |                         // Z_MEM_ERROR | 
 |  | 
 | #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) | 
 | #  include <fcntl.h> | 
 | #  include <io.h> | 
 | #  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) | 
 | #else | 
 | #  define SET_BINARY_MODE(file) | 
 | #endif | 
 |  | 
 | #define local static | 
 |  | 
 | // printf to an allocated string. Return the string, or NULL if the printf or | 
 | // allocation fails. | 
 | local char *aprintf(char *fmt, ...) { | 
 |     // Get the length of the result of the printf. | 
 |     va_list args; | 
 |     va_start(args, fmt); | 
 |     int len = vsnprintf(NULL, 0, fmt, args); | 
 |     va_end(args); | 
 |     if (len < 0) | 
 |         return NULL; | 
 |  | 
 |     // Allocate the required space and printf to it. | 
 |     char *str = malloc(len + 1); | 
 |     if (str == NULL) | 
 |         return NULL; | 
 |     va_start(args, fmt); | 
 |     vsnprintf(str, len + 1, fmt, args); | 
 |     va_end(args); | 
 |     return str; | 
 | } | 
 |  | 
 | // Return with an error, putting an allocated error message in *err. Doing an | 
 | // inflateEnd() on an already ended state, or one with state set to Z_NULL, is | 
 | // permitted. | 
 | #define BYE(...) \ | 
 |     do { \ | 
 |         inflateEnd(&strm); \ | 
 |         *err = aprintf(__VA_ARGS__); \ | 
 |         return 1; \ | 
 |     } while (0) | 
 |  | 
 | // Chunk size for buffered reads and for decompression. Twice this many bytes | 
 | // will be allocated on the stack by gzip_normalize(). Must fit in an unsigned. | 
 | #define CHUNK 16384 | 
 |  | 
 | // Read a gzip stream from in and write an equivalent normalized gzip stream to | 
 | // out. If given no input, an empty gzip stream will be written. If successful, | 
 | // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the | 
 | // details of the error are returned in *err, a pointer to an allocated string. | 
 | // | 
 | // The input may be a stream with multiple gzip members, which is converted to | 
 | // a single gzip member on the output. Each gzip member is decompressed at the | 
 | // level of deflate blocks. This enables clearing the last-block bit, shifting | 
 | // the compressed data to concatenate to the previous member's compressed data, | 
 | // which can end at an arbitrary bit boundary, and identifying stored blocks in | 
 | // order to resynchronize those to byte boundaries. The deflate compressed data | 
 | // is terminated with a 10-bit empty fixed block. If any members on the input | 
 | // end with a 10-bit empty fixed block, then that block is excised from the | 
 | // stream. This avoids appending empty fixed blocks for every normalization, | 
 | // and assures that gzip_normalize applied a second time will not change the | 
 | // input. The pad bits after stored block headers and after the final deflate | 
 | // block are all forced to zeros. | 
 | local int gzip_normalize(FILE *in, FILE *out, char **err) { | 
 |     // initialize the inflate engine to process a gzip member | 
 |     z_stream strm; | 
 |     strm.zalloc = Z_NULL; | 
 |     strm.zfree = Z_NULL; | 
 |     strm.opaque = Z_NULL; | 
 |     strm.avail_in = 0; | 
 |     strm.next_in = Z_NULL; | 
 |     if (inflateInit2(&strm, 15 + 16) != Z_OK) | 
 |         BYE("out of memory"); | 
 |  | 
 |     // State while processing the input gzip stream. | 
 |     enum {              // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ... | 
 |         BETWEEN,        // between gzip members (must end in this state) | 
 |         HEAD,           // reading a gzip header | 
 |         BLOCK,          // reading deflate blocks | 
 |         TAIL            // reading a gzip trailer | 
 |     } state = BETWEEN;              // current component being processed | 
 |     unsigned long crc = 0;          // accumulated CRC of uncompressed data | 
 |     unsigned long len = 0;          // accumulated length of uncompressed data | 
 |     unsigned long buf = 0;          // deflate stream bit buffer of num bits | 
 |     int num = 0;                    // number of bits in buf (at bottom) | 
 |  | 
 |     // Write a canonical gzip header (no mod time, file name, comment, extra | 
 |     // block, or extra flags, and OS is marked as unknown). | 
 |     fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); | 
 |  | 
 |     // Process the gzip stream from in until reaching the end of the input, | 
 |     // encountering invalid input, or experiencing an i/o error. | 
 |     int more;                       // true if not at the end of the input | 
 |     do { | 
 |         // State inside this loop. | 
 |         unsigned char *put;         // next input buffer location to process | 
 |         int prev;                   // number of bits from previous block in | 
 |                                     // the bit buffer, or -1 if not at the | 
 |                                     // start of a block | 
 |         unsigned long long memb;    // uncompressed length of member | 
 |         size_t tail;                // number of trailer bytes read (0..8) | 
 |         unsigned long part;         // accumulated trailer component | 
 |  | 
 |         // Get the next chunk of input from in. | 
 |         unsigned char dat[CHUNK]; | 
 |         strm.avail_in = fread(dat, 1, CHUNK, in); | 
 |         if (strm.avail_in == 0) | 
 |             break; | 
 |         more = strm.avail_in == CHUNK; | 
 |         strm.next_in = put = dat; | 
 |  | 
 |         // Run that chunk of input through the inflate engine to exhaustion. | 
 |         do { | 
 |             // At this point it is assured that strm.avail_in > 0. | 
 |  | 
 |             // Inflate until the end of a gzip component (header, deflate | 
 |             // block, trailer) is reached, or until all of the chunk is | 
 |             // consumed. The resulting decompressed data is discarded, though | 
 |             // the total size of the decompressed data in each member is | 
 |             // tracked, for the calculation of the total CRC. | 
 |             do { | 
 |                 // inflate and handle any errors | 
 |                 unsigned char scrap[CHUNK]; | 
 |                 strm.avail_out = CHUNK; | 
 |                 strm.next_out = scrap; | 
 |                 int ret = inflate(&strm, Z_BLOCK); | 
 |                 if (ret == Z_MEM_ERROR) | 
 |                     BYE("out of memory"); | 
 |                 if (ret == Z_DATA_ERROR) | 
 |                     BYE("input invalid: %s", strm.msg); | 
 |                 if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END) | 
 |                     BYE("internal error"); | 
 |  | 
 |                 // Update the number of uncompressed bytes generated in this | 
 |                 // member. The actual count (not modulo 2^32) is required to | 
 |                 // correctly compute the total CRC. | 
 |                 unsigned got = CHUNK - strm.avail_out; | 
 |                 memb += got; | 
 |                 if (memb < got) | 
 |                     BYE("overflow error"); | 
 |  | 
 |                 // Continue to process this chunk until it is consumed, or | 
 |                 // until the end of a component (header, deflate block, or | 
 |                 // trailer) is reached. | 
 |             } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0); | 
 |  | 
 |             // Since strm.avail_in was > 0 for the inflate call, some input was | 
 |             // just consumed. It is therefore assured that put < strm.next_in. | 
 |  | 
 |             // Disposition the consumed component or part of a component. | 
 |             switch (state) { | 
 |                 case BETWEEN: | 
 |                     state = HEAD; | 
 |                     // Fall through to HEAD when some or all of the header is | 
 |                     // processed. | 
 |  | 
 |                 case HEAD: | 
 |                     // Discard the header. | 
 |                     if (strm.data_type & 0x80) { | 
 |                         // End of header reached -- deflate blocks follow. | 
 |                         put = strm.next_in; | 
 |                         prev = num; | 
 |                         memb = 0; | 
 |                         state = BLOCK; | 
 |                     } | 
 |                     break; | 
 |  | 
 |                 case BLOCK: | 
 |                     // Copy the deflate stream to the output, but with the | 
 |                     // last-block-bit cleared. Re-synchronize stored block | 
 |                     // headers to the output byte boundaries. The bytes at | 
 |                     // put..strm.next_in-1 is the compressed data that has been | 
 |                     // processed and is ready to be copied to the output. | 
 |  | 
 |                     // At this point, it is assured that new compressed data is | 
 |                     // available, i.e., put < strm.next_in. If prev is -1, then | 
 |                     // that compressed data starts in the middle of a deflate | 
 |                     // block. If prev is not -1, then the bits in the bit | 
 |                     // buffer, possibly combined with the bits in *put, contain | 
 |                     // the three-bit header of the new deflate block. In that | 
 |                     // case, prev is the number of bits from the previous block | 
 |                     // that remain in the bit buffer. Since num is the number | 
 |                     // of bits in the bit buffer, we have that num - prev is | 
 |                     // the number of bits from the new block currently in the | 
 |                     // bit buffer. | 
 |  | 
 |                     // If strm.data_type & 0xc0 is 0x80, then the last byte of | 
 |                     // the available compressed data includes the last bits of | 
 |                     // the end of a deflate block. In that case, that last byte | 
 |                     // also has strm.data_type & 0x1f bits of the next deflate | 
 |                     // block, in the range 0..7. If strm.data_type & 0xc0 is | 
 |                     // 0xc0, then the last byte of the compressed data is the | 
 |                     // end of the deflate stream, followed by strm.data_type & | 
 |                     // 0x1f pad bits, also in the range 0..7. | 
 |  | 
 |                     // Set bits to the number of bits not yet consumed from the | 
 |                     // last byte. If we are at the end of the block, bits is | 
 |                     // either the number of bits in the last byte belonging to | 
 |                     // the next block, or the number of pad bits after the | 
 |                     // final block. In either of those cases, bits is in the | 
 |                     // range 0..7. | 
 |                     ;                   // (required due to C syntax oddity) | 
 |                     int bits = strm.data_type & 0x1f; | 
 |  | 
 |                     if (prev != -1) { | 
 |                         // We are at the start of a new block. Clear the last | 
 |                         // block bit, and check for special cases. If it is a | 
 |                         // stored block, then emit the header and pad to the | 
 |                         // next byte boundary. If it is a final, empty fixed | 
 |                         // block, then excise it. | 
 |  | 
 |                         // Some or all of the three header bits for this block | 
 |                         // may already be in the bit buffer. Load any remaining | 
 |                         // header bits into the bit buffer. | 
 |                         if (num - prev < 3) { | 
 |                             buf += (unsigned long)*put++ << num; | 
 |                             num += 8; | 
 |                         } | 
 |  | 
 |                         // Set last to have a 1 in the position of the last | 
 |                         // block bit in the bit buffer. | 
 |                         unsigned long last = (unsigned long)1 << prev; | 
 |  | 
 |                         if (((buf >> prev) & 7) == 3) { | 
 |                             // This is a final fixed block. Load at least ten | 
 |                             // bits from this block, including the header, into | 
 |                             // the bit buffer. We already have at least three, | 
 |                             // so at most one more byte needs to be loaded. | 
 |                             if (num - prev < 10) { | 
 |                                 if (put == strm.next_in) | 
 |                                     // Need to go get and process more input. | 
 |                                     // We'll end up back here to finish this. | 
 |                                     break; | 
 |                                 buf += (unsigned long)*put++ << num; | 
 |                                 num += 8; | 
 |                             } | 
 |                             if (((buf >> prev) & 0x3ff) == 3) { | 
 |                                 // That final fixed block is empty. Delete it | 
 |                                 // to avoid adding an empty block every time a | 
 |                                 // gzip stream is normalized. | 
 |                                 num = prev; | 
 |                                 buf &= last - 1;    // zero the pad bits | 
 |                             } | 
 |                         } | 
 |                         else if (((buf >> prev) & 6) == 0) { | 
 |                             // This is a stored block. Flush to the next | 
 |                             // byte boundary after the three-bit header. | 
 |                             num = (prev + 10) & ~7; | 
 |                             buf &= last - 1;        // zero the pad bits | 
 |                         } | 
 |  | 
 |                         // Clear the last block bit. | 
 |                         buf &= ~last; | 
 |  | 
 |                         // Write out complete bytes in the bit buffer. | 
 |                         while (num >= 8) { | 
 |                             putc(buf, out); | 
 |                             buf >>= 8; | 
 |                             num -= 8; | 
 |                         } | 
 |  | 
 |                         // If no more bytes left to process, then we have | 
 |                         // consumed the byte that had bits from the next block. | 
 |                         if (put == strm.next_in) | 
 |                             bits = 0; | 
 |                     } | 
 |  | 
 |                     // We are done handling the deflate block header. Now copy | 
 |                     // all or almost all of the remaining compressed data that | 
 |                     // has been processed so far. Don't copy one byte at the | 
 |                     // end if it contains bits from the next deflate block or | 
 |                     // pad bits at the end of a deflate block. | 
 |  | 
 |                     // mix is 1 if we are at the end of a deflate block, and if | 
 |                     // some of the bits in the last byte follow this block. mix | 
 |                     // is 0 if we are in the middle of a deflate block, if the | 
 |                     // deflate block ended on a byte boundary, or if all of the | 
 |                     // compressed data processed so far has been consumed. | 
 |                     int mix = (strm.data_type & 0x80) && bits; | 
 |  | 
 |                     // Copy all of the processed compressed data to the output, | 
 |                     // except for the last byte if it contains bits from the | 
 |                     // next deflate block or pad bits at the end of the deflate | 
 |                     // stream. Copy the data after shifting in num bits from | 
 |                     // buf in front of it, leaving num bits from the end of the | 
 |                     // compressed data in buf when done. | 
 |                     unsigned char *end = strm.next_in - mix; | 
 |                     if (put < end) { | 
 |                         if (num) | 
 |                             // Insert num bits from buf before the data being | 
 |                             // copied. | 
 |                             do { | 
 |                                 buf += (unsigned)(*put++) << num; | 
 |                                 putc(buf, out); | 
 |                                 buf >>= 8; | 
 |                             } while (put < end); | 
 |                         else { | 
 |                             // No shifting needed -- write directly. | 
 |                             fwrite(put, 1, end - put, out); | 
 |                             put = end; | 
 |                         } | 
 |                     } | 
 |  | 
 |                     // Process the last processed byte if it wasn't written. | 
 |                     if (mix) { | 
 |                         // Load the last byte into the bit buffer. | 
 |                         buf += (unsigned)(*put++) << num; | 
 |                         num += 8; | 
 |  | 
 |                         if (strm.data_type & 0x40) { | 
 |                             // We are at the end of the deflate stream and | 
 |                             // there are bits pad bits. Discard the pad bits | 
 |                             // and write a byte to the output, if available. | 
 |                             // Leave the num bits left over in buf to prepend | 
 |                             // to the next deflate stream. | 
 |                             num -= bits; | 
 |                             if (num >= 8) { | 
 |                                 putc(buf, out); | 
 |                                 num -= 8; | 
 |                                 buf >>= 8; | 
 |                             } | 
 |  | 
 |                             // Force the pad bits in the bit buffer to zeros. | 
 |                             buf &= ((unsigned long)1 << num) - 1; | 
 |  | 
 |                             // Don't need to set prev here since going to TAIL. | 
 |                         } | 
 |                         else | 
 |                             // At the end of an internal deflate block. Leave | 
 |                             // the last byte in the bit buffer to examine on | 
 |                             // the next entry to BLOCK, when more bits from the | 
 |                             // next block will be available. | 
 |                             prev = num - bits;      // number of bits in buffer | 
 |                                                     // from current block | 
 |                     } | 
 |  | 
 |                     // Don't have a byte left over, so we are in the middle of | 
 |                     // a deflate block, or the deflate block ended on a byte | 
 |                     // boundary. Set prev appropriately for the next entry into | 
 |                     // BLOCK. | 
 |                     else if (strm.data_type & 0x80) | 
 |                         // The block ended on a byte boundary, so no header | 
 |                         // bits are in the bit buffer. | 
 |                         prev = num; | 
 |                     else | 
 |                         // In the middle of a deflate block, so no header here. | 
 |                         prev = -1; | 
 |  | 
 |                     // Check for the end of the deflate stream. | 
 |                     if ((strm.data_type & 0xc0) == 0xc0) { | 
 |                         // That ends the deflate stream on the input side, the | 
 |                         // pad bits were discarded, and any remaining bits from | 
 |                         // the last block in the stream are saved in the bit | 
 |                         // buffer to prepend to the next stream. Process the | 
 |                         // gzip trailer next. | 
 |                         tail = 0; | 
 |                         part = 0; | 
 |                         state = TAIL; | 
 |                     } | 
 |                     break; | 
 |  | 
 |                 case TAIL: | 
 |                     // Accumulate available trailer bytes to update the total | 
 |                     // CRC and the total uncompressed length. | 
 |                     do { | 
 |                         part = (part >> 8) + ((unsigned long)(*put++) << 24); | 
 |                         tail++; | 
 |                         if (tail == 4) { | 
 |                             // Update the total CRC. | 
 |                             z_off_t len2 = memb; | 
 |                             if (len2 < 0 || (unsigned long long)len2 != memb) | 
 |                                 BYE("overflow error"); | 
 |                             crc = crc ? crc32_combine(crc, part, len2) : part; | 
 |                             part = 0; | 
 |                         } | 
 |                         else if (tail == 8) { | 
 |                             // Update the total uncompressed length. (It's ok | 
 |                             // if this sum is done modulo 2^32.) | 
 |                             len += part; | 
 |  | 
 |                             // At the end of a member. Set up to inflate an | 
 |                             // immediately following gzip member. (If we made | 
 |                             // it this far, then the trailer was valid.) | 
 |                             if (inflateReset(&strm) != Z_OK) | 
 |                                 BYE("internal error"); | 
 |                             state = BETWEEN; | 
 |                             break; | 
 |                         } | 
 |                     } while (put < strm.next_in); | 
 |                     break; | 
 |             } | 
 |  | 
 |             // Process the input buffer until completely consumed. | 
 |         } while (strm.avail_in > 0); | 
 |  | 
 |         // Process input until end of file, invalid input, or i/o error. | 
 |     } while (more); | 
 |  | 
 |     // Done with the inflate engine. | 
 |     inflateEnd(&strm); | 
 |  | 
 |     // Verify the validity of the input. | 
 |     if (state != BETWEEN) | 
 |         BYE("input invalid: incomplete gzip stream"); | 
 |  | 
 |     // Write the remaining deflate stream bits, followed by a terminating | 
 |     // deflate fixed block. | 
 |     buf += (unsigned long)3 << num; | 
 |     putc(buf, out); | 
 |     putc(buf >> 8, out); | 
 |     if (num > 6) | 
 |         putc(0, out); | 
 |  | 
 |     // Write the gzip trailer, which is the CRC and the uncompressed length | 
 |     // modulo 2^32, both in little-endian order. | 
 |     putc(crc, out); | 
 |     putc(crc >> 8, out); | 
 |     putc(crc >> 16, out); | 
 |     putc(crc >> 24, out); | 
 |     putc(len, out); | 
 |     putc(len >> 8, out); | 
 |     putc(len >> 16, out); | 
 |     putc(len >> 24, out); | 
 |     fflush(out); | 
 |  | 
 |     // Check for any i/o errors. | 
 |     if (ferror(in) || ferror(out)) | 
 |         BYE("i/o error: %s", strerror(errno)); | 
 |  | 
 |     // All good! | 
 |     *err = NULL; | 
 |     return 0; | 
 | } | 
 |  | 
 | // Normalize the gzip stream on stdin, writing the result to stdout. | 
 | int main(void) { | 
 |     // Avoid end-of-line conversions on evil operating systems. | 
 |     SET_BINARY_MODE(stdin); | 
 |     SET_BINARY_MODE(stdout); | 
 |  | 
 |     // Normalize from stdin to stdout, returning 1 on error, 0 if ok. | 
 |     char *err; | 
 |     int ret = gzip_normalize(stdin, stdout, &err); | 
 |     if (ret) | 
 |         fprintf(stderr, "gznorm error: %s\n", err); | 
 |     free(err); | 
 |     return ret; | 
 | } |