aboutsummaryrefslogtreecommitdiff
path: root/pgbak.c
diff options
context:
space:
mode:
authorHristo Venev <hristo@venev.name>2023-10-17 23:40:52 +0300
committerHristo Venev <hristo@venev.name>2023-10-24 21:10:16 +0300
commit01fabfa25f92424e35ecb4ae98e95a06aa2e278f (patch)
tree0b5c2d6711dd54556ad7ef3875b691aadf6ed0e3 /pgbak.c
parent42d544a9c03aa4b682189f2274c5c1bea346d635 (diff)
Improve backup consistency during snapshot
We now create the snapshot as "next" and only promote it to "current" once it finishes. WAL files are linked to both current and next.
Diffstat (limited to 'pgbak.c')
-rw-r--r--pgbak.c288
1 files changed, 186 insertions, 102 deletions
diff --git a/pgbak.c b/pgbak.c
index 8fa265f..876fff9 100644
--- a/pgbak.c
+++ b/pgbak.c
@@ -48,13 +48,25 @@ static const char *const LOCK_KIND[] = {
// ===== State flags =====
-#define STATE_SHIFT 4 // low=pending, high=running
-#define STATE_MASK 0x0f
+#define STATE_MASK 0x7
// We need to run the backup script.
-#define STATE_ARCHIVE 0x01
-// A new full snapshot needs to be made.
-#define STATE_SNAPSHOT 0x02
+#define S_WANT_ARCHIVE 0x08
+// A "next" full snapshot should be made unless one is in progress.
+#define S_WANT_SNAPSHOT 0x10
+// A "next" full snapshot needs to be made.
+#define S_WANT_NEW_SNAPSHOT 0x20
+
+// Done.
+#define STATE_IDLE 0x0
+// Run the backup script, goto STATE_IDLE
+#define STATE_ARCHIVE 0x1
+// Rename next->current, goto STATE_ARCHIVE
+#define STATE_PROMOTE 0x2
+// Run the backup script, goto STATE_PROMOTE
+#define STATE_PREARCHIVE 0x3
+// Take a new snapshot, goto STATE_PREARCHIVE
+#define STATE_SNAPSHOT 0x4
static void lck_release(int pos) {
@@ -196,15 +208,6 @@ static int cld_pipe(int in_fd, int out_fd, const char *const *argv) {
static int bak_snapshot(char *name) {
- {
- struct timespec ts;
- if(clock_gettime(CLOCK_REALTIME, &ts) < 0) {
- fprintf(stderr, "clock_gettime(CLOCK_REALTIME): %m\n");
- abort();
- }
- sprintf(name, "%lld", (long long)ts.tv_sec);
- }
-
if(mkdirat(backup_dfd, name, 0700) < 0) {
fprintf(stderr, "Failed to create $PGBAK/%s: %m\n", name);
return -1;
@@ -228,9 +231,9 @@ static int bak_snapshot(char *name) {
goto fail_clean;
}
- (void)unlinkat(backup_dfd, "current", 0);
- if(symlinkat(name, backup_dfd, "current") < 0) {
- fprintf(stderr, "Failed to update $PGBAK/current: %m\n");
+ (void)unlinkat(backup_dfd, "next", 0);
+ if(symlinkat(name, backup_dfd, "next") < 0) {
+ fprintf(stderr, "Failed to update $PGBAK/next: %m\n");
goto fail;
}
@@ -266,9 +269,12 @@ static int bak_snapshot(char *name) {
fprintf(stderr, "fsync(): %m\n");
abort();
}
- return subdfd;
+ close(subdfd);
+
+ return 0;
fail:
+ (void)unlinkat(backup_dfd, "next", 0);
close(out_fd);
fail_clean:
(void)unlinkat(subdfd, "pg_wal", AT_REMOVEDIR);
@@ -283,11 +289,11 @@ static bool should_pgbasebackup(int subdfd) {
struct stat st;
if(fstatat(subdfd, BASE_OUT, &st, AT_SYMLINK_NOFOLLOW) < 0) {
- fprintf(stderr, "Failed to stat %s: %m\n", BASE_OUT);
+ fprintf(stderr, "Failed to stat %s: %m; creating new snapshot\n", BASE_OUT);
return true;
}
if(!S_ISREG(st.st_mode)) {
- fprintf(stderr, "Expected regular file at %s\n", BASE_OUT);
+ fprintf(stderr, "Expected regular file at %s; creating new snapshot\n", BASE_OUT);
return true;
}
@@ -297,7 +303,7 @@ static bool should_pgbasebackup(int subdfd) {
int wal_dfd = openat(subdfd, "pg_wal", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
if(wal_dfd < 0) {
- fprintf(stderr, "Failed to open $PGBAK/current/pg_wal: %m\n");
+ fprintf(stderr, "Failed to open $PGBAK/current/pg_wal: %m; creating new snapshot\n");
return true;
}
@@ -323,90 +329,135 @@ static bool should_pgbasebackup(int subdfd) {
}
closedir(wal_dir);
- return wal_count > 8 && (wal_size >> 1) >= base_size;
+ if(wal_count > 8 && (wal_size >> 1) >= base_size) {
+ fprintf(stderr, "WAL directroy too big; creating new snapshot\n");
+ return true;
+ }
+ return false;
+}
+
+static int bak_open_current(char *name, size_t namebuf) {
+ ssize_t r = readlinkat(backup_dfd, "current", name, namebuf);
+ if(r < 0) {
+ if(errno != ENOENT) {
+ fprintf(stderr, "Failed to readlink() $PGBAK/current: %m\n");
+ return -1;
+ }
+ fprintf(stderr, "Failed to open() $PGBAK/current: %m\n");
+ return -1;
+ }
+ if((size_t)r >= namebuf) {
+ fprintf(stderr, "Symlink $PGBAK/current target too long\n");
+ return -1;
+ }
+ name[r] = 0;
+ int subdfd = openat(backup_dfd, name, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+ if(subdfd < 0) {
+ fprintf(stderr, "Failed to open() $PGBAK/%s: %m\n", name);
+ }
+ return subdfd;
}
// Archiver. Called in a subprocess with LOCK_STATE.
static void bak_work(int state) {
unsigned int backoff = 4 * 512;
- char name[32];
- char ts_s[32];
- const char *const cmd[] = {
- "../scripts/backup", name, ts_s, NULL,
+ char cur_name[32];
+ char ts_name[32];
+ const char *archive_cmd[] = {
+ "../scripts/backup", cur_name, ts_name, NULL, NULL,
};
lck_wait(LOCK_WAIT);
bool did_snapshot = false;
- int subdfd = -1;
bool refresh_ts = true;
+ int flags = state;
+ state &= STATE_MASK;
// Skip reopening if a full snapshot was requested.
while(true) {
- if((state & STATE_ARCHIVE) && subdfd < 0) {
- ssize_t r = readlinkat(backup_dfd, "current", name, sizeof(name));
- if(r < 0) {
- if(errno != ENOENT) {
- fprintf(stderr, "Failed to readlink() $PGBAK/current: %m\n");
- _Exit(1);
- }
+ switch(state) {
+ case STATE_IDLE:
+ if(flags & S_WANT_ARCHIVE) state = STATE_ARCHIVE;
+ [[fallthrough]];
+ case STATE_ARCHIVE:
+ if(flags & S_WANT_SNAPSHOT) state = STATE_SNAPSHOT;
+ [[fallthrough]];
+ case STATE_PROMOTE:
+ case STATE_PREARCHIVE:
+ if(flags & S_WANT_NEW_SNAPSHOT) state = STATE_SNAPSHOT;
+ break;
+ default:
state = STATE_SNAPSHOT;
- } else {
- if((size_t)r >= sizeof(name)) {
- fprintf(stderr, "Symlink $PGBAK/current target too long\n");
- _Exit(1);
- }
- name[r] = 0;
- subdfd = openat(backup_dfd, name, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
- if(subdfd < 0) {
- fprintf(stderr, "Failed to open() $PGBAK/%s: %m\n", name);
- state = STATE_SNAPSHOT;
- }
- }
}
+ lck_write(state);
+ lck_release(LOCK_STATE);
+
+ if(state == STATE_IDLE) break;
- if((state & STATE_ARCHIVE) && refresh_ts) {
+ if(state == STATE_SNAPSHOT || refresh_ts) {
refresh_ts = false;
struct timespec ts;
if(clock_gettime(CLOCK_REALTIME, &ts) < 0) {
fprintf(stderr, "clock_gettime(CLOCK_REALTIME): %m\n");
abort();
}
- sprintf(ts_s, "%lld", (long long)ts.tv_sec);
+ sprintf(ts_name, "%lld", (long long)ts.tv_sec);
}
- lck_write(state << STATE_SHIFT);
- lck_release(LOCK_STATE);
-
const char *failed = NULL;
- if(state & STATE_ARCHIVE) {
- pid_t child = cld_spawn(-1, -1, subdfd, cmd);
- if(cld_wait(child, cmd[0])) {
- failed = "backup script";
- } else {
- fprintf(stderr, "Finished archive %s-%s\n", name, ts_s);
- }
- if(!(state & STATE_SNAPSHOT) && !did_snapshot && should_pgbasebackup(subdfd)) {
- state |= STATE_SNAPSHOT;
- }
- if(!failed) {
- state &= ~STATE_ARCHIVE;
- }
- } else if(state & STATE_SNAPSHOT) {
- if(subdfd >= 0) {
- close(subdfd);
- }
- subdfd = bak_snapshot(name);
- if(subdfd < 0) {
- failed = "pg_basebackup";
- } else {
- fprintf(stderr, "Finished snapshot %s\n", name);
+ switch(state) {
+ case STATE_SNAPSHOT:
+ if(bak_snapshot(ts_name) < 0) {
+ failed = "pg_basebackup";
+ break;
+ }
+ fprintf(stderr, "Finished snapshot %s\n", ts_name);
did_snapshot = true;
- refresh_ts = true;
+ state = STATE_PREARCHIVE;
+ break;
+ case STATE_PROMOTE:
+ if(renameat(backup_dfd, "next", backup_dfd, "current") < 0) {
+ fprintf(stderr, "Failed to promote $PGBAK/next to current: %m\n");
+ state = STATE_SNAPSHOT;
+ break;
+ }
+ if(fsync(backup_dfd) < 0) {
+ fprintf(stderr, "fsync(): %m\n");
+ abort();
+ }
state = STATE_ARCHIVE;
+ break;
+ case STATE_ARCHIVE:
+ case STATE_PREARCHIVE:
+ {
+ int subdfd = bak_open_current(cur_name, sizeof(cur_name));
+ if(subdfd < 0) {
+ if(state == STATE_ARCHIVE) {
+ state = STATE_SNAPSHOT;
+ } else {
+ state = STATE_PROMOTE;
+ }
+ break;
+ }
+ archive_cmd[3] = (state == STATE_PREARCHIVE) ? "old" : NULL;
+ pid_t child = cld_spawn(-1, -1, subdfd, archive_cmd);
+ if(cld_wait(child, archive_cmd[0])) {
+ close(subdfd);
+ failed = "backup script";
+ break;
+ }
+ fprintf(stderr, "Finished archive %s-%s\n", cur_name, ts_name);
+ if(state == STATE_PREARCHIVE) {
+ state = STATE_PROMOTE;
+ } else if(!did_snapshot && should_pgbasebackup(subdfd)) {
+ state = STATE_SNAPSHOT;
+ } else {
+ state = STATE_IDLE;
+ }
+ close(subdfd);
+ break;
}
- } else {
- break;
}
if(failed) {
@@ -427,9 +478,7 @@ static void bak_work(int state) {
}
lck_wait(LOCK_STATE);
- int nstate = lck_read();
- state |= nstate & STATE_MASK;
- if(nstate & STATE_ARCHIVE) refresh_ts = true;
+ flags = lck_read();
}
}
@@ -439,16 +488,27 @@ static int bak_begin(int flags) {
lck_wait(LOCK_STATE);
int state = lck_read();
+ state |= flags;
if(!lck_try(LOCK_BACKUP)) {
if(flags & ~state) {
- lck_write(state | flags);
+ lck_write(state);
}
+ lck_release(LOCK_STATE);
return -1;
}
// We got the backup lock. If anything was running, it must have crashed.
// Mark it as pending.
- return flags | (state & STATE_MASK) | (state >> STATE_SHIFT);
+ switch(state & STATE_MASK) {
+ case STATE_SNAPSHOT:
+ fprintf(stderr, "Resuming interrupted snapshot\n");
+ break;
+ case STATE_PROMOTE:
+ case STATE_ARCHIVE:
+ fprintf(stderr, "Resuming interrupted backup\n");
+ break;
+ }
+ return state;
}
static void on_sigalrm(int) {
@@ -475,32 +535,47 @@ static int cmd_wal(const char *name) {
return 1;
}
- int flags = STATE_ARCHIVE;
- int wal_dfd = openat(backup_dfd, "current/pg_wal", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
- if(wal_dfd < 0) {
- if(errno == ENOENT) {
- fprintf(stderr, "Directory $PGBAK/current/pg_wal does not exist; skipping %s and doing snapshot\n", name);
- goto skip;
+ int flags = S_WANT_ARCHIVE;
+ // current is renamed to next, so by opening next first, we ensure that they
+ // are distinct.
+ int next_dfd = openat(backup_dfd, "next/pg_wal", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+ if(next_dfd < 0 && errno != ENOENT) {
+ fprintf(stderr, "Failed to open $PGBAK/next/pg_wal: %m; will force new snapshot\n");
+ flags |= S_WANT_NEW_SNAPSHOT;
+ }
+ int cur_dfd = openat(backup_dfd, "current/pg_wal", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+ if(cur_dfd < 0) {
+ if(errno != ENOENT) {
+ fprintf(stderr, "Failed to open directory $PGBAK/current/pg_wal: %m\n");
+ return 1;
+ }
+ fprintf(stderr, "Directory $PGBAK/current/pg_wal does not exist; will request new snapshot\n");
+ if(next_dfd < 0) {
+ flags = S_WANT_SNAPSHOT;
}
- fprintf(stderr, "Failed to open directory $PGBAK/current/pg_wal: %m\n");
- return 1;
+ }
+ if(cur_dfd < 0 && next_dfd < 0) {
+ fprintf(stderr, "Skipping %s\n", name);
+ goto skip;
}
char name_out[256];
memcpy(name_out, name + 7, name_len - 7);
memcpy(name_out + name_len - 7, WAL_EXT, strlen(WAL_EXT) + 1);
- if(faccessat(wal_dfd, name_out, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
- // Assume we've gone back in time
+ if(
+ (cur_dfd >= 0 && faccessat(cur_dfd, name_out, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) ||
+ (next_dfd >= 0 && faccessat(next_dfd, name_out, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+ ) {
+ // Assume we've gone back in WAL history
fprintf(stderr, "Backup of %s already exists; forcing snapshot\n", name);
- close(wal_dfd);
- // Prevent misdirected WALs or double snapshot
- (void) unlinkat(backup_dfd, "current", 0);
- flags = STATE_SNAPSHOT;
+ if(cur_dfd >= 0) close(cur_dfd);
+ if(next_dfd >= 0) close(next_dfd);
+ flags = S_WANT_NEW_SNAPSHOT;
goto skip;
}
- int out_fd = openat(wal_dfd, ".", O_WRONLY | O_TMPFILE | O_CLOEXEC, 0600);
+ int out_fd = openat(backup_dfd, ".", O_WRONLY | O_TMPFILE | O_CLOEXEC, 0600);
if(out_fd < 0) {
fprintf(stderr, "Failed to create WAL backup file at $PGBAK/current: %m\n");
goto fail_dir;
@@ -521,17 +596,25 @@ static int cmd_wal(const char *name) {
char fd_path[32];
sprintf(fd_path, "/proc/self/fd/%d", out_fd);
- if(linkat(AT_FDCWD, fd_path, wal_dfd, name_out, AT_SYMLINK_FOLLOW) < 0) {
+ if(cur_dfd >= 0 && linkat(AT_FDCWD, fd_path, cur_dfd, name_out, AT_SYMLINK_FOLLOW) < 0) {
+ fprintf(stderr, "Failed to link WAL backup: %m\n");
+ goto fail;
+ }
+ if(next_dfd >= 0 && linkat(AT_FDCWD, fd_path, next_dfd, name_out, AT_SYMLINK_FOLLOW) < 0) {
fprintf(stderr, "Failed to link WAL backup: %m\n");
goto fail;
}
close(out_fd);
- if(fsync(wal_dfd) < 0) {
+ if(cur_dfd >= 0 && fsync(cur_dfd) < 0) {
+ fprintf(stderr, "fsync(): %m\n");
+ goto fail_dir;
+ }
+ if(next_dfd >= 0 && fsync(next_dfd) < 0) {
fprintf(stderr, "fsync(): %m\n");
goto fail_dir;
}
- close(wal_dfd);
+ close(cur_dfd);
skip:
flags = bak_begin(flags);
@@ -551,14 +634,15 @@ skip:
fail:
close(out_fd);
fail_dir:
- close(wal_dfd);
+ if(cur_dfd >= 0) close(cur_dfd);
+ if(next_dfd >= 0) close(next_dfd);
return 1;
}
static int cmd_sync(bool force_archive, bool force_snapshot) {
int flags = 0;
- if(force_archive) flags |= STATE_ARCHIVE;
- if(force_snapshot) flags |= STATE_SNAPSHOT;
+ if(force_archive) flags |= S_WANT_ARCHIVE;
+ if(force_snapshot) flags |= S_WANT_NEW_SNAPSHOT;
flags = bak_begin(flags);
if(flags < 0) {
return cmd_wait(-1);