From 586990981a2cf0b4522451178ac0882967811149 Mon Sep 17 00:00:00 2001 From: Hristo Venev Date: Thu, 28 May 2020 21:45:44 +0000 Subject: Initial commit. --- compress.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 compress.c (limited to 'compress.c') diff --git a/compress.c b/compress.c new file mode 100644 index 0000000..3accc9a --- /dev/null +++ b/compress.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later + +#include +#include +#include +#include +#include +#include + +static ssize_t read_data(FILE *in, uint8_t **pdata) { + size_t alloc = 4096; + uint8_t *data = malloc(alloc); + if(!data) return -1; + + size_t len = 0; + while(1) { + size_t n = fread_unlocked(data + len, 1, alloc - len, in); + if(n == 0) break; + len += n; + size_t want_min = len + (len >> 2); + if(alloc < want_min) { + size_t a2 = len + (len >> 1); + uint8_t *r = realloc(data, a2); + if(!r) goto fail; + data = r; + alloc = a2; + } + } + if(ferror_unlocked(in)) goto fail; + + uint8_t *r = realloc(data, len); + if(r) data = r; + + *pdata = data; + return len; + +fail:; + int e = errno; + free(data); + errno = e; + return -1; +} + +struct backref { + size_t cost; + uint8_t mlen, moff; +}; + +struct node { + size_t off, len; + struct node *next[256]; +}; + +static void free_nodes(struct node **p) { + for(size_t i = 0; i < 256; i++) { + struct node *n = p[i]; + if(!n) continue; + free_nodes(n->next); + free(n); + } +} + +inline static void init_nodes(struct node **p) { + memset(p, 0, 256 * sizeof(*p)); +} + +static struct backref *compress_data(const uint8_t *data, size_t len) { + struct node *root[256]; + init_nodes(root); + + if(len >= (size_t)-1 / sizeof(struct backref) - 1) { + errno = ENOMEM; + return NULL; + } + struct backref *best = malloc((len + 1) * sizeof(struct backref)); + if(!best) { + return NULL; + } + + best[0] = (struct backref){ + .cost = 0, + }; + + for(size_t suff_len = 1; suff_len <= len; suff_len++) { + size_t min_cost = best[suff_len - 1].cost + 9; + uint8_t min_mlen = 0; + uint8_t min_moff = 0; + + size_t at = suff_len; + size_t bound = at < 256 ? 0 : at - 256; + struct node **p = root; + while(at > bound) { + size_t left = at - bound; + + p += data[at - 1]; + struct node *n = *p; + size_t mlen; + if(!n) { + n = calloc(1, sizeof(struct node)); + if(!n) goto fail; + mlen = left; + *p = n; + } else { + assert(n->len > 0); + assert(n->off < at); + size_t moff = at - n->off; + if(moff > 256) { + free_nodes(n->next); + init_nodes(n->next); + mlen = left; + } else { + const uint8_t *cmp1 = data + at; + const uint8_t *cmp2 = data + n->off; + if(n->len < left) left = n->len; + mlen = 0; + while(mlen < left) { + if(*--cmp1 != *--cmp2) break; + mlen++; + } + assert(mlen > 0); + + for(size_t i = mlen; i > 0; i--) { + size_t base = at - i; + size_t cost = best[base].cost + 17; + assert(suff_len - base <= 256); + if(cost < min_cost) { + assert(suff_len - base > 1); + min_cost = cost; + min_mlen = suff_len - base - 1; + min_moff = moff - 1; + } + } + + if(mlen != n->len) { + n->len -= mlen; + n->off -= mlen; + struct node *n2 = calloc(1, sizeof(struct node)); + if(!n2) goto fail; + n2->next[*cmp2] = n; + n = n2; + *p = n; + } + } + } + n->off = at; + n->len = mlen; + p = n->next; + at -= mlen; + } + + best[suff_len] = (struct backref){ + .cost = min_cost, + .mlen = min_mlen, + .moff = min_moff, + }; + } + + free_nodes(root); + return best; + +fail:; + int e = errno; + free_nodes(root); + errno = e; + return NULL; +} + +static int reconstruct(FILE *out, const uint8_t *data, size_t len, struct backref *back) { + size_t cost = back[len].cost; + size_t bytes = (cost + 7) / 8; + uint8_t *buf = malloc(bytes); + if(!buf) return -1; + + uint8_t *at = buf + bytes; + uint8_t cmd = 0; + while(len) { + assert(cost == back[len].cost); + cmd <<= 1; + if(back[len].mlen == 0) { + cost -= 9; + *--at = data[--len]; + } else { + cmd |= 1; + cost -= 17; + *--at = back[len].mlen; + *--at = back[len].moff; + len -= (size_t)back[len].mlen + 1; + } + if(cost % 8 == 0) { + *--at = cmd; + cmd = 0; + } + } + assert(cost == 0); + assert(at == buf); + + at = buf; + while(bytes) { + size_t n = fwrite_unlocked(buf, 1, bytes, out); + if(n == 0) { + int e = errno; + free(buf); + errno = e; + return -1; + } + at += n; + bytes -= n; + } + free(buf); + return 0; +} + +int main() { + uint8_t *data; + ssize_t n = read_data(stdin, &data); + if(n < 0) { + fprintf(stderr, "Read Error: %m\n"); + return 1; + } + + struct backref *best = compress_data(data, n); + + int r = reconstruct(stdout, data, n, best); + if(r < 0) { + fprintf(stderr, "Write Error: %m\n"); + return 1; + } + free(best); + free(data); + return 0; +} -- cgit