453 lines
16 KiB
C
453 lines
16 KiB
C
/* pigz.c -- parallel implementation of gzip
|
|
* Copyright (C) 2007 Mark Adler
|
|
* Version 1.1 28 January 2007 Mark Adler
|
|
*/
|
|
|
|
/* Version history:
|
|
1.0 17 Jan 2007 First version
|
|
1.1 28 Jan 2007 Avoid void * arithmetic (some compilers don't get that)
|
|
Add note about requiring zlib 1.2.3
|
|
Allow compression level 0 (no compression)
|
|
Completely rewrite parallelism -- add a write thread
|
|
Use deflateSetDictionary() to make use of history
|
|
Tune argument defaults to best performance on four cores
|
|
*/
|
|
|
|
/*
|
|
pigz compresses from stdin to stdout using threads to make use of multiple
|
|
processors and cores. The input is broken up into 128 KB chunks, and each
|
|
is compressed separately. The CRC for each chunk is also calculated
|
|
separately. The compressed chunks are written in order to the output,
|
|
and the overall CRC is calculated from the CRC's of the chunks.
|
|
|
|
The compressed data format generated is the gzip format using the deflate
|
|
compression method. First a gzip header is written, followed by raw deflate
|
|
partial streams. They are partial, in that they do not have a terminating
|
|
block. At the end, the deflate stream is terminated with a final empty
|
|
static block, and lastly a gzip trailer is written with the CRC and the
|
|
number of input bytes.
|
|
|
|
Each raw deflate partial stream is terminated by an empty stored block
(using the Z_SYNC_FLUSH option of zlib), in order to end that partial
bit stream at a byte boundary. That allows the partial streams to be
concatenated simply as sequences of bytes. This adds a very small four
or five byte overhead to the output for each input chunk.
|
|
|
|
zlib's crc32_combine() routine allows the calculation of the CRC of the
entire input using the independent CRCs of the chunks. pigz requires zlib
version 1.2.3 or later, since that is the first version that provides the
crc32_combine() function.
|
|
|
|
pigz uses the POSIX pthread library for thread control and communication.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <pthread.h>
|
|
#include <sys/types.h>
|
|
#include <sys/uio.h>
|
|
#include <unistd.h>
|
|
#include "zlib.h"
|
|
|
|
/* "local" marks functions and globals that are private to this file */
#define local static
|
|
|
|
/* exit with error */
|
|
local void bail(char *msg)
|
|
{
|
|
fprintf(stderr, "pigz abort: %s\n", msg);
|
|
exit(1);
|
|
}
|
|
|
|
/* read up to len bytes into buf, repeating read() calls as needed */
|
|
local size_t readn(int desc, unsigned char *buf, size_t len)
|
|
{
|
|
ssize_t ret;
|
|
size_t got;
|
|
|
|
got = 0;
|
|
while (len) {
|
|
ret = read(desc, buf, len);
|
|
if (ret < 0)
|
|
bail("read error");
|
|
if (ret == 0)
|
|
break;
|
|
buf += ret;
|
|
len -= ret;
|
|
got += ret;
|
|
}
|
|
return got;
|
|
}
|
|
|
|
/* write len bytes, repeating write() calls as needed */
|
|
local void writen(int desc, unsigned char *buf, size_t len)
|
|
{
|
|
ssize_t ret;
|
|
|
|
while (len) {
|
|
ret = write(desc, buf, len);
|
|
if (ret < 1)
|
|
bail("write error");
|
|
buf += ret;
|
|
len -= ret;
|
|
}
|
|
}
|
|
|
|
/* an integer state shared between two threads -- the value is only read or
   changed with the lock held, and the condition variable is signaled each
   time the value is set so that a waiting thread can re-check it */
struct flag {
    int value;              /* current state of the flag */
    pthread_mutex_t lock;   /* protects value */
    pthread_cond_t cond;    /* signaled whenever value is set */
};
|
|
|
|
/* initialize a flag for use, starting with value val */
|
|
local void flag_init(struct flag *me, int val)
|
|
{
|
|
me->value = val;
|
|
pthread_mutex_init(&(me->lock), NULL);
|
|
pthread_cond_init(&(me->cond), NULL);
|
|
}
|
|
|
|
/* set the flag to val, signal another process that may be waiting for it */
|
|
local void flag_set(struct flag *me, int val)
|
|
{
|
|
pthread_mutex_lock(&(me->lock));
|
|
me->value = val;
|
|
pthread_cond_signal(&(me->cond));
|
|
pthread_mutex_unlock(&(me->lock));
|
|
}
|
|
|
|
/* if it isn't already, wait for some other thread to set the flag to val */
|
|
local void flag_wait(struct flag *me, int val)
|
|
{
|
|
pthread_mutex_lock(&(me->lock));
|
|
while (me->value != val)
|
|
pthread_cond_wait(&(me->cond), &(me->lock));
|
|
pthread_mutex_unlock(&(me->lock));
|
|
}
|
|
|
|
/* if flag is equal to val, wait for some other thread to change it */
|
|
local void flag_wait_not(struct flag *me, int val)
|
|
{
|
|
pthread_mutex_lock(&(me->lock));
|
|
while (me->value == val)
|
|
pthread_cond_wait(&(me->cond), &(me->lock));
|
|
pthread_mutex_unlock(&(me->lock));
|
|
}
|
|
|
|
/* clean up the flag when done with it */
|
|
local void flag_done(struct flag *me)
|
|
{
|
|
pthread_cond_destroy(&(me->cond));
|
|
pthread_mutex_destroy(&(me->lock));
|
|
}
|
|
|
|
/* a unit of work to feed to compress_thread() -- it is assumed that the out
|
|
buffer is large enough to hold the maximum size len bytes could deflate to,
|
|
plus five bytes for the final sync marker */
|
|
struct work {
|
|
size_t len; /* length of input */
|
|
unsigned long crc; /* crc of input */
|
|
unsigned char *buf; /* input */
|
|
unsigned char *out; /* space for output (guaranteed big enough) */
|
|
z_stream strm; /* pre-initialized z_stream */
|
|
struct flag busy; /* busy flag indicating work unit in use */
|
|
pthread_t comp; /* this compression thread */
|
|
};
|
|
|
|
/* states of the busy flag in a work unit */
#define IDLE  0     /* compressing and writing done -- compress can start */
#define COMP  1     /* compressing -- input and output buffers are in use */
#define WRITE 2     /* compress done, output being written -- input buffer
                       can be read into again */
|
|
|
|
/* read-only globals (set by main/read thread before others started) */
|
|
local int ind; /* input file descriptor */
|
|
local int outd; /* output file descriptor */
|
|
local int level; /* compression level */
|
|
local int procs; /* number of compression threads (>= 2) */
|
|
local size_t size; /* uncompressed input size per thread (>= 32K) */
|
|
local struct work *jobs; /* work units: jobs[0..procs-1] */
|
|
|
|
/* indices of the following and preceding work units, treating jobs[] as a
   ring of procs entries (n must be in 0..procs-1) */
#define NEXT(n) ((n) + 1 == procs ? 0 : (n) + 1)
#define PREV(n) ((n) != 0 ? (n) - 1 : procs - 1)
|
|
|
|
/* size of the sliding dictionary (history window) used by deflate */
#define DICT 32768U

/* the largest power of two that an unsigned int can hold -- requests to zlib
   functions that take unsigned int lengths are limited to this amount */
#define MAX ((~0U >> 1) + 1U)
|
|
|
|
/* compress thread: compress the input in the provided work unit and compute
|
|
its crc -- assume that the amount of space at job->out is guaranteed to be
|
|
enough for the compressed output, as determined by the maximum expansion
|
|
of deflate compression -- use the input in the previous work unit (if there
|
|
is one) to set the deflate dictionary for better compression */
|
|
local void *compress_thread(void *arg)
|
|
{
|
|
size_t len; /* input length for this work unit */
|
|
unsigned long crc; /* crc of input data */
|
|
struct work *prev; /* previous work unit */
|
|
struct work *job = arg; /* work unit for this thread */
|
|
z_stream *strm = &(job->strm); /* zlib stream for this work unit */
|
|
|
|
/* reset state for a new compressed stream */
|
|
(void)deflateReset(strm);
|
|
|
|
/* initialize input, output, and crc */
|
|
strm->next_in = job->buf;
|
|
strm->next_out = job->out;
|
|
len = job->len;
|
|
crc = crc32(0L, Z_NULL, 0);
|
|
|
|
/* set dictionary if this isn't the first work unit, and if we will be
|
|
compressing something (the read thread assures that the dictionary
|
|
data in the previous work unit is still there) */
|
|
prev = jobs + PREV(job - jobs);
|
|
if (prev->buf != NULL && len != 0)
|
|
deflateSetDictionary(strm, prev->buf + (size - DICT), DICT);
|
|
|
|
/* run MAX-sized amounts of input through deflate and crc32 -- this loop
|
|
is needed for those cases where the integer type is smaller than the
|
|
size_t type, or when len is close to the limit of the size_t type */
|
|
while (len > MAX) {
|
|
strm->avail_in = MAX;
|
|
strm->avail_out = (unsigned)-1;
|
|
crc = crc32(crc, strm->next_in, strm->avail_in);
|
|
(void)deflate(strm, Z_NO_FLUSH);
|
|
len -= MAX;
|
|
}
|
|
|
|
/* run last piece through deflate and crc32, follow with a sync marker */
|
|
if (len) {
|
|
strm->avail_in = len;
|
|
strm->avail_out = (unsigned)-1;
|
|
crc = crc32(crc, strm->next_in, strm->avail_in);
|
|
(void)deflate(strm, Z_SYNC_FLUSH);
|
|
}
|
|
|
|
/* don't need to Z_FINISH, since we'd delete the last two bytes anyway */
|
|
|
|
/* return result */
|
|
job->crc = crc;
|
|
return NULL;
|
|
}
|
|
|
|
/* store the low 32 bits of b in the four bytes at a, least significant byte
   first (both arguments are evaluated more than once) */
#define PUT4(a,b) ((a)[0] = (unsigned char)((b) & 0xff), \
                   (a)[1] = (unsigned char)(((b) >> 8) & 0xff), \
                   (a)[2] = (unsigned char)(((b) >> 16) & 0xff), \
                   (a)[3] = (unsigned char)(((b) >> 24) & 0xff))
|
|
|
|
/* write thread: wait for compression threads to complete, write output in
|
|
order, also write gzip header and trailer around the compressed data */
|
|
local void *write_thread(void *arg)
|
|
{
|
|
int n; /* compress thread index */
|
|
size_t len; /* length of input processed */
|
|
unsigned long tot; /* total uncompressed size (overflow ok) */
|
|
unsigned long crc; /* CRC-32 of uncompressed data */
|
|
unsigned char wrap[10]; /* gzip header or trailer */
|
|
|
|
/* write simple gzip header */
|
|
memcpy(wrap, "\037\213\10\0\0\0\0\0\0\3", 10);
|
|
wrap[8] = level == 9 ? 2 : (level == 1 ? 4 : 0);
|
|
writen(outd, wrap, 10);
|
|
|
|
/* process output of compress threads until end of input */
|
|
tot = 0;
|
|
crc = crc32(0L, Z_NULL, 0);
|
|
n = 0;
|
|
do {
|
|
/* wait for compress thread to start, then wait to complete */
|
|
flag_wait(&(jobs[n].busy), COMP);
|
|
pthread_join(jobs[n].comp, NULL);
|
|
|
|
/* now that compress is done, allow read thread to use input buffer */
|
|
flag_set(&(jobs[n].busy), WRITE);
|
|
|
|
/* write compressed data and update length and crc */
|
|
writen(outd, jobs[n].out, jobs[n].strm.next_out - jobs[n].out);
|
|
len = jobs[n].len;
|
|
tot += len;
|
|
crc = crc32_combine(crc, jobs[n].crc, len);
|
|
|
|
/* release this work unit and go to the next work unit */
|
|
flag_set(&(jobs[n].busy), IDLE);
|
|
n = NEXT(n);
|
|
|
|
/* an input buffer less than size in length indicates end of input */
|
|
} while (len == size);
|
|
|
|
/* write final static block and gzip trailer (crc and len mod 2^32) */
|
|
wrap[0] = 3; wrap[1] = 0;
|
|
PUT4(wrap + 2, crc);
|
|
PUT4(wrap + 6, tot);
|
|
writen(outd, wrap, 10);
|
|
return NULL;
|
|
}
|
|
|
|
/* one-time initialization of a work unit -- this is where we set the deflate
|
|
compression level and request raw deflate, and also where we set the size
|
|
of the output buffer to guarantee enough space for a worst-case deflate
|
|
ending with a Z_SYNC_FLUSH */
|
|
local void job_init(struct work *job)
|
|
{
|
|
int ret; /* deflateInit2() return value */
|
|
|
|
job->buf = malloc(size);
|
|
job->out = malloc(size + (size >> 11) + 10);
|
|
job->strm.zfree = Z_NULL;
|
|
job->strm.zalloc = Z_NULL;
|
|
job->strm.opaque = Z_NULL;
|
|
ret = deflateInit2(&(job->strm), level, Z_DEFLATED, -15, 8,
|
|
Z_DEFAULT_STRATEGY);
|
|
if (job->buf == NULL || job->out == NULL || ret != Z_OK)
|
|
bail("not enough memory");
|
|
}
|
|
|
|
/* compress ind to outd in the gzip format, using multiple threads for the
|
|
compression and crc calculation and another thread for writing the output --
|
|
the read thread is the main thread */
|
|
local void read_thread(void)
|
|
{
|
|
int n; /* general index */
|
|
size_t got; /* amount read */
|
|
pthread_attr_t attr; /* thread attributes (left at defaults) */
|
|
pthread_t write; /* write thread */
|
|
|
|
/* set defaults (not all pthread implementations default to joinable) */
|
|
pthread_attr_init(&attr);
|
|
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
|
|
|
|
/* allocate and set up work list (individual work units will be initialized
|
|
as needed, in case the input is short), assure that allocation size
|
|
arithmetic does not overflow */
|
|
if (size + (size >> 11) + 10 < (size >> 11) + 10 ||
|
|
(ssize_t)(size + (size >> 11) + 10) < 0 ||
|
|
((size_t)0 - 1) / procs <= sizeof(struct work) ||
|
|
(jobs = malloc(procs * sizeof(struct work))) == NULL)
|
|
bail("not enough memory");
|
|
for (n = 0; n < procs; n++) {
|
|
jobs[n].buf = NULL;
|
|
flag_init(&(jobs[n].busy), IDLE);
|
|
}
|
|
|
|
/* start write thread */
|
|
pthread_create(&write, &attr, write_thread, NULL);
|
|
|
|
/* read from input and start compress threads (write thread will pick up
|
|
the output of the compress threads) */
|
|
n = 0;
|
|
do {
|
|
/* initialize this work unit if it's the first time it's used */
|
|
if (jobs[n].buf == NULL)
|
|
job_init(jobs + n);
|
|
|
|
/* read input data, but wait for last compress on this work unit to be
|
|
done, and wait for the dictionary to be used by the last compress on
|
|
the next work unit */
|
|
flag_wait_not(&(jobs[n].busy), COMP);
|
|
flag_wait_not(&(jobs[NEXT(n)].busy), COMP);
|
|
got = readn(ind, jobs[n].buf, size);
|
|
|
|
/* start compress thread, but wait for write to be done first */
|
|
flag_wait(&(jobs[n].busy), IDLE);
|
|
jobs[n].len = got;
|
|
pthread_create(&(jobs[n].comp), &attr, compress_thread, jobs + n);
|
|
|
|
/* mark work unit so write thread knows compress was started */
|
|
flag_set(&(jobs[n].busy), COMP);
|
|
|
|
/* go to the next work unit */
|
|
n = NEXT(n);
|
|
|
|
/* do until end of input, indicated by a read less than size */
|
|
} while (got == size);
|
|
|
|
/* wait for the write thread to complete -- the write thread will join with
|
|
all of the compress threads, so this waits for all of the threads to
|
|
complete */
|
|
pthread_join(write, NULL);
|
|
|
|
/* free up all requested resources and return */
|
|
for (n = procs - 1; n >= 0; n--) {
|
|
flag_done(&(jobs[n].busy));
|
|
(void)deflateEnd(&(jobs[n].strm));
|
|
free(jobs[n].out);
|
|
free(jobs[n].buf);
|
|
}
|
|
free(jobs);
|
|
pthread_attr_destroy(&attr);
|
|
}
|
|
|
|
/* Process arguments for level, size, and procs, compress from stdin to
|
|
stdout in the gzip format. Note that procs must be at least two in
|
|
order to provide a dictionary in one work unit for the other work
|
|
unit, and that size must be at least 32K to store a full dictionary. */
|
|
int main(int argc, char **argv)
|
|
{
|
|
int n; /* general index */
|
|
int get; /* command line parameters to get */
|
|
char *arg; /* command line argument */
|
|
|
|
/* set defaults -- 32 processes and 128K buffers was found to provide
|
|
good utilization of four cores (about 97%) and balanced the overall
|
|
execution time impact of more threads against more dictionary
|
|
processing for a fixed amount of memory -- the memory usage for these
|
|
settings and full use of all work units (at least 4 MB of input) is
|
|
16.2 MB
|
|
*/
|
|
level = Z_DEFAULT_COMPRESSION;
|
|
procs = 32;
|
|
size = 131072UL;
|
|
|
|
/* process command-line arguments */
|
|
get = 0;
|
|
for (n = 1; n < argc; n++) {
|
|
arg = argv[n];
|
|
if (*arg == '-') {
|
|
while (*++arg)
|
|
if (*arg >= '0' && *arg <= '9') /* compression level */
|
|
level = *arg - '0';
|
|
else if (*arg == 'b') /* chunk size in K */
|
|
get |= 1;
|
|
else if (*arg == 'p') /* number of processes */
|
|
get |= 2;
|
|
else if (*arg == 'h') { /* help */
|
|
fputs("usage: pigz [-0..9] [-b blocksizeinK]", stderr);
|
|
fputs(" [-p processes] < foo > foo.gz\n", stderr);
|
|
return 0;
|
|
}
|
|
else
|
|
bail("invalid option");
|
|
}
|
|
else if (get & 1) {
|
|
if (get & 2)
|
|
bail("you need to separate the -b and -p options");
|
|
size = (size_t)(atol(arg)) << 10; /* chunk size */
|
|
if (size < DICT)
|
|
bail("invalid option");
|
|
get = 0;
|
|
}
|
|
else if (get & 2) {
|
|
procs = atoi(arg); /* processes */
|
|
if (procs < 2)
|
|
bail("invalid option");
|
|
get = 0;
|
|
}
|
|
else
|
|
bail("invalid option (you need to pipe input and output)");
|
|
}
|
|
if (get)
|
|
bail("missing option argument");
|
|
|
|
/* do parallel compression from stdin to stdout (the read thread starts up
|
|
the write thread and the compression threads, and they all join before
|
|
the read thread returns) */
|
|
ind = 0;
|
|
outd = 1;
|
|
read_thread();
|
|
|
|
/* done */
|
|
return 0;
|
|
}
|