summaryrefslogblamecommitdiff
path: root/compress.c
blob: 59bd35cf62416529519ac6fb0dc97cfdbf57adf5 (plain) (tree)






























                                                                          

                                






































































































































































































                                                                                                 
// SPDX-License-Identifier: LGPL-3.0-or-later

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <string.h>

static ssize_t read_data(FILE *in, uint8_t **pdata) {
	size_t alloc = 4096;
	uint8_t *data = malloc(alloc);
	if(!data) return -1;

	size_t len = 0;
	while(1) {
		size_t n = fread_unlocked(data + len, 1, alloc - len, in);
		if(n == 0) break;
		len += n;
		size_t want_min = len + (len >> 2);
		if(alloc < want_min) {
			size_t a2 = len + (len >> 1);
			uint8_t *r = realloc(data, a2);
			if(!r) goto fail;
			data = r;
			alloc = a2;
		}
	}
	if(ferror_unlocked(in)) goto fail;

	uint8_t *r = realloc(data, len);
	if(len && !r) goto fail;
	data = r;

	*pdata = data;
	return len;

fail:;
	int e = errno;
	free(data);
	errno = e;
	return -1;
}

struct backref {
	size_t cost;
	uint8_t mlen, moff;
};

struct node {
	size_t off, len;
	struct node *next[256];
};

static void free_nodes(struct node **p) {
	for(size_t i = 0; i < 256; i++) {
		struct node *n = p[i];
		if(!n) continue;
		free_nodes(n->next);
		free(n);
	}
}

inline static void init_nodes(struct node **p) {
	memset(p, 0, 256 * sizeof(*p));
}

static struct backref *compress_data(const uint8_t *data, size_t len) {
	struct node *root[256];
	init_nodes(root);

	if(len >= (size_t)-1 / sizeof(struct backref) - 1) {
		errno = ENOMEM;
		return NULL;
	}
	struct backref *best = malloc((len + 1) * sizeof(struct backref));
	if(!best) {
		return NULL;
	}

	best[0] = (struct backref){
		.cost = 0,
	};

	for(size_t suff_len = 1; suff_len <= len; suff_len++) {
		size_t min_cost = best[suff_len - 1].cost + 9;
		uint8_t min_mlen = 0;
		uint8_t min_moff = 0;

		size_t at = suff_len;
		size_t bound = at < 256 ? 0 : at - 256;
		struct node **p = root;
		while(at > bound) {
			size_t left = at - bound;

			p += data[at - 1];
			struct node *n = *p;
			size_t mlen;
			if(!n) {
				n = calloc(1, sizeof(struct node));
				if(!n) goto fail;
				mlen = left;
				*p = n;
			} else {
				assert(n->len > 0);
				assert(n->off < at);
				size_t moff = at - n->off;
				if(moff > 256) {
					free_nodes(n->next);
					init_nodes(n->next);
					mlen = left;
				} else {
					const uint8_t *cmp1 = data + at;
					const uint8_t *cmp2 = data + n->off;
					if(n->len < left) left = n->len;
					mlen = 0;
					while(mlen < left) {
						if(*--cmp1 != *--cmp2) break;
						mlen++;
					}
					assert(mlen > 0);

					for(size_t i = mlen; i > 0; i--) {
						size_t base = at - i;
						size_t cost = best[base].cost + 17;
						assert(suff_len - base <= 256);
						if(cost < min_cost) {
							assert(suff_len - base > 1);
							min_cost = cost;
							min_mlen = suff_len - base - 1;
							min_moff = moff - 1;
						}
					}

					if(mlen != n->len) {
						n->len -= mlen;
						n->off -= mlen;
						struct node *n2 = calloc(1, sizeof(struct node));
						if(!n2) goto fail;
						n2->next[*cmp2] = n;
						n = n2;
						*p = n;
					}
				}
			}
			n->off = at;
			n->len = mlen;
			p = n->next;
			at -= mlen;
		}

		best[suff_len] = (struct backref){
			.cost = min_cost,
			.mlen = min_mlen,
			.moff = min_moff,
		};
	}

	free_nodes(root);
	return best;

fail:;
	int e = errno;
	free_nodes(root);
	errno = e;
	return NULL;
}

static int reconstruct(FILE *out, const uint8_t *data, size_t len, struct backref *back) {
	size_t cost = back[len].cost;
	size_t bytes = (cost + 7) / 8;
	uint8_t *buf = malloc(bytes);
	if(!buf) return -1;

	uint8_t *at = buf + bytes;
	uint8_t cmd = 0;
	while(len) {
		assert(cost == back[len].cost);
		cmd <<= 1;
		if(back[len].mlen == 0) {
			cost -= 9;
			*--at = data[--len];
		} else {
			cmd |= 1;
			cost -= 17;
			*--at = back[len].mlen;
			*--at = back[len].moff;
			len -= (size_t)back[len].mlen + 1;
		}
		if(cost % 8 == 0) {
			*--at = cmd;
			cmd = 0;
		}
	}
	assert(cost == 0);
	assert(at == buf);

	at = buf;
	while(bytes) {
		size_t n = fwrite_unlocked(buf, 1, bytes, out);
		if(n == 0) {
			int e = errno;
			free(buf);
			errno = e;
			return -1;
		}
		at += n;
		bytes -= n;
	}
	free(buf);
	return 0;
}

int main() {
	uint8_t *data;
	ssize_t n = read_data(stdin, &data);
	if(n < 0) {
		fprintf(stderr, "Read Error: %m\n");
		return 1;
	}

	struct backref *best = compress_data(data, n);

	int r = reconstruct(stdout, data, n, best);
	if(r < 0) {
		fprintf(stderr, "Write Error: %m\n");
		return 1;
	}
	free(best);
	free(data);
	return 0;
}