summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHristo Venev <hristo@venev.name>2020-05-28 21:45:44 +0000
committerHristo Venev <hristo@venev.name>2020-05-28 21:50:29 +0000
commit586990981a2cf0b4522451178ac0882967811149 (patch)
treee6406a5955b27c00bc1c342d0f2606f3094c61b3
Initial commit.
-rw-r--r--.gitignore2
-rw-r--r--Makefile6
-rw-r--r--compress.c231
-rw-r--r--uncompress.c51
4 files changed, 290 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..00d2c92
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/compress
+/uncompress
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..76aacaf
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,6 @@
+CFLAGS?=-O2
+
+all: compress uncompress
+
+%: %.c
+ $(CC) -Wall -Wextra $(CFLAGS) $< -o $@
diff --git a/compress.c b/compress.c
new file mode 100644
index 0000000..3accc9a
--- /dev/null
+++ b/compress.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+#include <string.h>
+
+static ssize_t read_data(FILE *in, uint8_t **pdata) {
+ size_t alloc = 4096;
+ uint8_t *data = malloc(alloc);
+ if(!data) return -1;
+
+ size_t len = 0;
+ while(1) {
+ size_t n = fread_unlocked(data + len, 1, alloc - len, in);
+ if(n == 0) break;
+ len += n;
+ size_t want_min = len + (len >> 2);
+ if(alloc < want_min) {
+ size_t a2 = len + (len >> 1);
+ uint8_t *r = realloc(data, a2);
+ if(!r) goto fail;
+ data = r;
+ alloc = a2;
+ }
+ }
+ if(ferror_unlocked(in)) goto fail;
+
+ uint8_t *r = realloc(data, len);
+ if(r) data = r;
+
+ *pdata = data;
+ return len;
+
+fail:;
+ int e = errno;
+ free(data);
+ errno = e;
+ return -1;
+}
+
+struct backref {
+ size_t cost;
+ uint8_t mlen, moff;
+};
+
+struct node {
+ size_t off, len;
+ struct node *next[256];
+};
+
+static void free_nodes(struct node **p) {
+ for(size_t i = 0; i < 256; i++) {
+ struct node *n = p[i];
+ if(!n) continue;
+ free_nodes(n->next);
+ free(n);
+ }
+}
+
+inline static void init_nodes(struct node **p) {
+ memset(p, 0, 256 * sizeof(*p));
+}
+
+static struct backref *compress_data(const uint8_t *data, size_t len) {
+ struct node *root[256];
+ init_nodes(root);
+
+ if(len >= (size_t)-1 / sizeof(struct backref) - 1) {
+ errno = ENOMEM;
+ return NULL;
+ }
+ struct backref *best = malloc((len + 1) * sizeof(struct backref));
+ if(!best) {
+ return NULL;
+ }
+
+ best[0] = (struct backref){
+ .cost = 0,
+ };
+
+ for(size_t suff_len = 1; suff_len <= len; suff_len++) {
+ size_t min_cost = best[suff_len - 1].cost + 9;
+ uint8_t min_mlen = 0;
+ uint8_t min_moff = 0;
+
+ size_t at = suff_len;
+ size_t bound = at < 256 ? 0 : at - 256;
+ struct node **p = root;
+ while(at > bound) {
+ size_t left = at - bound;
+
+ p += data[at - 1];
+ struct node *n = *p;
+ size_t mlen;
+ if(!n) {
+ n = calloc(1, sizeof(struct node));
+ if(!n) goto fail;
+ mlen = left;
+ *p = n;
+ } else {
+ assert(n->len > 0);
+ assert(n->off < at);
+ size_t moff = at - n->off;
+ if(moff > 256) {
+ free_nodes(n->next);
+ init_nodes(n->next);
+ mlen = left;
+ } else {
+ const uint8_t *cmp1 = data + at;
+ const uint8_t *cmp2 = data + n->off;
+ if(n->len < left) left = n->len;
+ mlen = 0;
+ while(mlen < left) {
+ if(*--cmp1 != *--cmp2) break;
+ mlen++;
+ }
+ assert(mlen > 0);
+
+ for(size_t i = mlen; i > 0; i--) {
+ size_t base = at - i;
+ size_t cost = best[base].cost + 17;
+ assert(suff_len - base <= 256);
+ if(cost < min_cost) {
+ assert(suff_len - base > 1);
+ min_cost = cost;
+ min_mlen = suff_len - base - 1;
+ min_moff = moff - 1;
+ }
+ }
+
+ if(mlen != n->len) {
+ n->len -= mlen;
+ n->off -= mlen;
+ struct node *n2 = calloc(1, sizeof(struct node));
+ if(!n2) goto fail;
+ n2->next[*cmp2] = n;
+ n = n2;
+ *p = n;
+ }
+ }
+ }
+ n->off = at;
+ n->len = mlen;
+ p = n->next;
+ at -= mlen;
+ }
+
+ best[suff_len] = (struct backref){
+ .cost = min_cost,
+ .mlen = min_mlen,
+ .moff = min_moff,
+ };
+ }
+
+ free_nodes(root);
+ return best;
+
+fail:;
+ int e = errno;
+ free_nodes(root);
+ errno = e;
+ return NULL;
+}
+
+static int reconstruct(FILE *out, const uint8_t *data, size_t len, struct backref *back) {
+ size_t cost = back[len].cost;
+ size_t bytes = (cost + 7) / 8;
+ uint8_t *buf = malloc(bytes);
+ if(!buf) return -1;
+
+ uint8_t *at = buf + bytes;
+ uint8_t cmd = 0;
+ while(len) {
+ assert(cost == back[len].cost);
+ cmd <<= 1;
+ if(back[len].mlen == 0) {
+ cost -= 9;
+ *--at = data[--len];
+ } else {
+ cmd |= 1;
+ cost -= 17;
+ *--at = back[len].mlen;
+ *--at = back[len].moff;
+ len -= (size_t)back[len].mlen + 1;
+ }
+ if(cost % 8 == 0) {
+ *--at = cmd;
+ cmd = 0;
+ }
+ }
+ assert(cost == 0);
+ assert(at == buf);
+
+ at = buf;
+ while(bytes) {
+ size_t n = fwrite_unlocked(buf, 1, bytes, out);
+ if(n == 0) {
+ int e = errno;
+ free(buf);
+ errno = e;
+ return -1;
+ }
+ at += n;
+ bytes -= n;
+ }
+ free(buf);
+ return 0;
+}
+
+int main() {
+ uint8_t *data;
+ ssize_t n = read_data(stdin, &data);
+ if(n < 0) {
+ fprintf(stderr, "Read Error: %m\n");
+ return 1;
+ }
+
+ struct backref *best = compress_data(data, n);
+
+ int r = reconstruct(stdout, data, n, best);
+ if(r < 0) {
+ fprintf(stderr, "Write Error: %m\n");
+ return 1;
+ }
+ free(best);
+ free(data);
+ return 0;
+}
diff --git a/uncompress.c b/uncompress.c
new file mode 100644
index 0000000..3ef83ec
--- /dev/null
+++ b/uncompress.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+
+#include <stdio.h>
+#include <stdint.h>
+
+int main() {
+ FILE *in = stdin;
+ FILE *out = stdout;
+
+ int c;
+ uint8_t out_i = 0;
+ char outbuf[256];
+ while(1) {
+ c = fgetc_unlocked(in);
+ if(c < 0) goto end;
+ uint8_t cmd = c;
+ for(size_t i = 0; i < 8; i++) {
+ if(cmd & 1) {
+ c = fgetc_unlocked(in);
+ if(c < 0) goto fail;
+ uint8_t offset = c;
+ offset += 1;
+ c = fgetc_unlocked(in);
+ if(c < 0) goto fail;
+ uint8_t len = c;
+ do {
+ c = outbuf[(uint8_t)(out_i - offset)];
+ outbuf[out_i++] = c;
+ c = fputc_unlocked(c, out);
+ if(c < 0) goto writefail;
+ } while(len--);
+ } else {
+ c = fgetc_unlocked(in);
+ if(c < 0) goto end;
+ outbuf[out_i++] = c;
+ c = fputc_unlocked(c, out);
+ if(c < 0) goto writefail;
+ }
+ cmd >>= 1;
+ }
+ }
+
+end:
+ if(feof_unlocked(in)) return 0;
+fail:
+ fprintf(stderr, ferror_unlocked(in) ? "Read Error: %m\n" : "Unexpected EOF\n");
+ return 1;
+writefail:
+ fprintf(stderr, "Write Error: %m\n");
+ return 1;
+}