From 01fabfa25f92424e35ecb4ae98e95a06aa2e278f Mon Sep 17 00:00:00 2001 From: Hristo Venev Date: Tue, 17 Oct 2023 23:40:52 +0300 Subject: Improve backup consistency during snapshot We now create the snapshot as "next" and only promote it to "current" once it finishes. WAL files are linked to both current and next. --- pgbak.c | 288 +++++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 186 insertions(+), 102 deletions(-) (limited to 'pgbak.c') diff --git a/pgbak.c b/pgbak.c index 8fa265f..876fff9 100644 --- a/pgbak.c +++ b/pgbak.c @@ -48,13 +48,25 @@ static const char *const LOCK_KIND[] = { // ===== State flags ===== -#define STATE_SHIFT 4 // low=pending, high=running -#define STATE_MASK 0x0f +#define STATE_MASK 0x7 // We need to run the backup script. -#define STATE_ARCHIVE 0x01 -// A new full snapshot needs to be made. -#define STATE_SNAPSHOT 0x02 +#define S_WANT_ARCHIVE 0x08 +// A "next" full snapshot should be made unless one is in progress. +#define S_WANT_SNAPSHOT 0x10 +// A "next" full snapshot needs to be made. +#define S_WANT_NEW_SNAPSHOT 0x20 + +// Done. +#define STATE_IDLE 0x0 +// Run the backup script, goto STATE_IDLE +#define STATE_ARCHIVE 0x1 +// Rename next->current, goto STATE_ARCHIVE +#define STATE_PROMOTE 0x2 +// Run the backup script, goto STATE_PROMOTE +#define STATE_PREARCHIVE 0x3 +// Take a new snapshot, goto STATE_PREARCHIVE +#define STATE_SNAPSHOT 0x4 static void lck_release(int pos) { @@ -196,15 +208,6 @@ static int cld_pipe(int in_fd, int out_fd, const char *const *argv) { static int bak_snapshot(char *name) { - { - struct timespec ts; - if(clock_gettime(CLOCK_REALTIME, &ts) < 0) { - fprintf(stderr, "clock_gettime(CLOCK_REALTIME): %m\n"); - abort(); - } - sprintf(name, "%lld", (long long)ts.tv_sec); - } - if(mkdirat(backup_dfd, name, 0700) < 0) { fprintf(stderr, "Failed to create $PGBAK/%s: %m\n", name); return -1; @@ -228,9 +231,9 @@ static int bak_snapshot(char *name) { goto fail_clean; } - (void)unlinkat(backup_dfd, "current", 0); - if(symlinkat(name, backup_dfd, "current") < 0) { - fprintf(stderr, "Failed to update $PGBAK/current: %m\n"); + (void)unlinkat(backup_dfd, "next", 0); + if(symlinkat(name, backup_dfd, "next") < 0) { + fprintf(stderr, "Failed to update $PGBAK/next: %m\n"); goto fail; } @@ -266,9 +269,12 @@ static int bak_snapshot(char *name) { fprintf(stderr, "fsync(): %m\n"); abort(); } - return subdfd; + close(subdfd); + + return 0; fail: + (void)unlinkat(backup_dfd, "next", 0); close(out_fd); fail_clean: (void)unlinkat(subdfd, "pg_wal", AT_REMOVEDIR); @@ -283,11 +289,11 @@ static bool should_pgbasebackup(int subdfd) { struct stat st; if(fstatat(subdfd, BASE_OUT, &st, AT_SYMLINK_NOFOLLOW) < 0) { - fprintf(stderr, "Failed to stat %s: %m\n", BASE_OUT); + fprintf(stderr, "Failed to stat %s: %m; creating new snapshot\n", BASE_OUT); return true; } if(!S_ISREG(st.st_mode)) { - fprintf(stderr, "Expected regular file at %s\n", BASE_OUT); + fprintf(stderr, "Expected regular file at %s; creating new snapshot\n", BASE_OUT); return true; } @@ -297,7 +303,7 @@ static bool should_pgbasebackup(int subdfd) { int wal_dfd = openat(subdfd, "pg_wal", O_RDONLY | O_DIRECTORY | O_CLOEXEC); if(wal_dfd < 0) { - fprintf(stderr, "Failed to open $PGBAK/current/pg_wal: %m\n"); + fprintf(stderr, "Failed to open $PGBAK/current/pg_wal: %m; creating new snapshot\n"); return true; } @@ -323,90 +329,135 @@ static bool should_pgbasebackup(int subdfd) { } closedir(wal_dir); - return wal_count > 8 && (wal_size >> 1) >= base_size; + if(wal_count > 8 && (wal_size >> 1) >= base_size) { + fprintf(stderr, "WAL directroy too big; creating new snapshot\n"); + return true; + } + return false; +} + +static int bak_open_current(char *name, size_t namebuf) { + ssize_t r = readlinkat(backup_dfd, "current", name, namebuf); + if(r < 0) { + if(errno != ENOENT) { + fprintf(stderr, "Failed to readlink() $PGBAK/current: %m\n"); + return -1; + } + fprintf(stderr, "Failed to open() $PGBAK/current: %m\n"); + return -1; + } + if((size_t)r >= namebuf) { + fprintf(stderr, "Symlink $PGBAK/current target too long\n"); + return -1; + } + name[r] = 0; + int subdfd = openat(backup_dfd, name, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if(subdfd < 0) { + fprintf(stderr, "Failed to open() $PGBAK/%s: %m\n", name); + } + return subdfd; } // Archiver. Called in a subprocess with LOCK_STATE. static void bak_work(int state) { unsigned int backoff = 4 * 512; - char name[32]; - char ts_s[32]; - const char *const cmd[] = { - "../scripts/backup", name, ts_s, NULL, + char cur_name[32]; + char ts_name[32]; + const char *archive_cmd[] = { + "../scripts/backup", cur_name, ts_name, NULL, NULL, }; lck_wait(LOCK_WAIT); bool did_snapshot = false; - int subdfd = -1; bool refresh_ts = true; + int flags = state; + state &= STATE_MASK; // Skip reopening if a full snapshot was requested. while(true) { - if((state & STATE_ARCHIVE) && subdfd < 0) { - ssize_t r = readlinkat(backup_dfd, "current", name, sizeof(name)); - if(r < 0) { - if(errno != ENOENT) { - fprintf(stderr, "Failed to readlink() $PGBAK/current: %m\n"); - _Exit(1); - } + switch(state) { + case STATE_IDLE: + if(flags & S_WANT_ARCHIVE) state = STATE_ARCHIVE; + [[fallthrough]]; + case STATE_ARCHIVE: + if(flags & S_WANT_SNAPSHOT) state = STATE_SNAPSHOT; + [[fallthrough]]; + case STATE_PROMOTE: + case STATE_PREARCHIVE: + if(flags & S_WANT_NEW_SNAPSHOT) state = STATE_SNAPSHOT; + break; + default: state = STATE_SNAPSHOT; - } else { - if((size_t)r >= sizeof(name)) { - fprintf(stderr, "Symlink $PGBAK/current target too long\n"); - _Exit(1); - } - name[r] = 0; - subdfd = openat(backup_dfd, name, O_RDONLY | O_DIRECTORY | O_CLOEXEC); - if(subdfd < 0) { - fprintf(stderr, "Failed to open() $PGBAK/%s: %m\n", name); - state = STATE_SNAPSHOT; - } - } } + lck_write(state); + lck_release(LOCK_STATE); + + if(state == STATE_IDLE) break; - if((state & STATE_ARCHIVE) && refresh_ts) { + if(state == STATE_SNAPSHOT || refresh_ts) { refresh_ts = false; struct timespec ts; if(clock_gettime(CLOCK_REALTIME, &ts) < 0) { fprintf(stderr, "clock_gettime(CLOCK_REALTIME): %m\n"); abort(); } - sprintf(ts_s, "%lld", (long long)ts.tv_sec); + sprintf(ts_name, "%lld", (long long)ts.tv_sec); } - lck_write(state << STATE_SHIFT); - lck_release(LOCK_STATE); - const char *failed = NULL; - if(state & STATE_ARCHIVE) { - pid_t child = cld_spawn(-1, -1, subdfd, cmd); - if(cld_wait(child, cmd[0])) { - failed = "backup script"; - } else { - fprintf(stderr, "Finished archive %s-%s\n", name, ts_s); - } - if(!(state & STATE_SNAPSHOT) && !did_snapshot && should_pgbasebackup(subdfd)) { - state |= STATE_SNAPSHOT; - } - if(!failed) { - state &= ~STATE_ARCHIVE; - } - } else if(state & STATE_SNAPSHOT) { - if(subdfd >= 0) { - close(subdfd); - } - subdfd = bak_snapshot(name); - if(subdfd < 0) { - failed = "pg_basebackup"; - } else { - fprintf(stderr, "Finished snapshot %s\n", name); + switch(state) { + case STATE_SNAPSHOT: + if(bak_snapshot(ts_name) < 0) { + failed = "pg_basebackup"; + break; + } + fprintf(stderr, "Finished snapshot %s\n", ts_name); did_snapshot = true; - refresh_ts = true; + state = STATE_PREARCHIVE; + break; + case STATE_PROMOTE: + if(renameat(backup_dfd, "next", backup_dfd, "current") < 0) { + fprintf(stderr, "Failed to promote $PGBAK/next to current: %m\n"); + state = STATE_SNAPSHOT; + break; + } + if(fsync(backup_dfd) < 0) { + fprintf(stderr, "fsync(): %m\n"); + abort(); + } state = STATE_ARCHIVE; + break; + case STATE_ARCHIVE: + case STATE_PREARCHIVE: + { + int subdfd = bak_open_current(cur_name, sizeof(cur_name)); + if(subdfd < 0) { + if(state == STATE_ARCHIVE) { + state = STATE_SNAPSHOT; + } else { + state = STATE_PROMOTE; + } + break; + } + archive_cmd[3] = (state == STATE_PREARCHIVE) ? "old" : NULL; + pid_t child = cld_spawn(-1, -1, subdfd, archive_cmd); + if(cld_wait(child, archive_cmd[0])) { + close(subdfd); + failed = "backup script"; + break; + } + fprintf(stderr, "Finished archive %s-%s\n", cur_name, ts_name); + if(state == STATE_PREARCHIVE) { + state = STATE_PROMOTE; + } else if(!did_snapshot && should_pgbasebackup(subdfd)) { + state = STATE_SNAPSHOT; + } else { + state = STATE_IDLE; + } + close(subdfd); + break; } - } else { - break; } if(failed) { @@ -427,9 +478,7 @@ static void bak_work(int state) { } lck_wait(LOCK_STATE); - int nstate = lck_read(); - state |= nstate & STATE_MASK; - if(nstate & STATE_ARCHIVE) refresh_ts = true; + flags = lck_read(); } } @@ -439,16 +488,27 @@ static int bak_begin(int flags) { lck_wait(LOCK_STATE); int state = lck_read(); + state |= flags; if(!lck_try(LOCK_BACKUP)) { if(flags & ~state) { - lck_write(state | flags); + lck_write(state); } + lck_release(LOCK_STATE); return -1; } // We got the backup lock. If anything was running, it must have crashed. // Mark it as pending. - return flags | (state & STATE_MASK) | (state >> STATE_SHIFT); + switch(state & STATE_MASK) { + case STATE_SNAPSHOT: + fprintf(stderr, "Resuming interrupted snapshot\n"); + break; + case STATE_PROMOTE: + case STATE_ARCHIVE: + fprintf(stderr, "Resuming interrupted backup\n"); + break; + } + return state; } static void on_sigalrm(int) { @@ -475,32 +535,47 @@ static int cmd_wal(const char *name) { return 1; } - int flags = STATE_ARCHIVE; - int wal_dfd = openat(backup_dfd, "current/pg_wal", O_RDONLY | O_DIRECTORY | O_CLOEXEC); - if(wal_dfd < 0) { - if(errno == ENOENT) { - fprintf(stderr, "Directory $PGBAK/current/pg_wal does not exist; skipping %s and doing snapshot\n", name); - goto skip; + int flags = S_WANT_ARCHIVE; + // current is renamed to next, so by opening next first, we ensure that they + // are distinct. + int next_dfd = openat(backup_dfd, "next/pg_wal", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if(next_dfd < 0 && errno != ENOENT) { + fprintf(stderr, "Failed to open $PGBAK/next/pg_wal: %m; will force new snapshot\n"); + flags |= S_WANT_NEW_SNAPSHOT; + } + int cur_dfd = openat(backup_dfd, "current/pg_wal", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if(cur_dfd < 0) { + if(errno != ENOENT) { + fprintf(stderr, "Failed to open directory $PGBAK/current/pg_wal: %m\n"); + return 1; + } + fprintf(stderr, "Directory $PGBAK/current/pg_wal does not exist; will request new snapshot\n"); + if(next_dfd < 0) { + flags = S_WANT_SNAPSHOT; } - fprintf(stderr, "Failed to open directory $PGBAK/current/pg_wal: %m\n"); - return 1; + } + if(cur_dfd < 0 && next_dfd < 0) { + fprintf(stderr, "Skipping %s\n", name); + goto skip; } char name_out[256]; memcpy(name_out, name + 7, name_len - 7); memcpy(name_out + name_len - 7, WAL_EXT, strlen(WAL_EXT) + 1); - if(faccessat(wal_dfd, name_out, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) { - // Assume we've gone back in time + if( + (cur_dfd >= 0 && faccessat(cur_dfd, name_out, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) || + (next_dfd >= 0 && faccessat(next_dfd, name_out, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) + ) { + // Assume we've gone back in WAL history fprintf(stderr, "Backup of %s already exists; forcing snapshot\n", name); - close(wal_dfd); - // Prevent misdirected WALs or double snapshot - (void) unlinkat(backup_dfd, "current", 0); - flags = STATE_SNAPSHOT; + if(cur_dfd >= 0) close(cur_dfd); + if(next_dfd >= 0) close(next_dfd); + flags = S_WANT_NEW_SNAPSHOT; goto skip; } - int out_fd = openat(wal_dfd, ".", O_WRONLY | O_TMPFILE | O_CLOEXEC, 0600); + int out_fd = openat(backup_dfd, ".", O_WRONLY | O_TMPFILE | O_CLOEXEC, 0600); if(out_fd < 0) { fprintf(stderr, "Failed to create WAL backup file at $PGBAK/current: %m\n"); goto fail_dir; @@ -521,17 +596,25 @@ static int cmd_wal(const char *name) { char fd_path[32]; sprintf(fd_path, "/proc/self/fd/%d", out_fd); - if(linkat(AT_FDCWD, fd_path, wal_dfd, name_out, AT_SYMLINK_FOLLOW) < 0) { + if(cur_dfd >= 0 && linkat(AT_FDCWD, fd_path, cur_dfd, name_out, AT_SYMLINK_FOLLOW) < 0) { + fprintf(stderr, "Failed to link WAL backup: %m\n"); + goto fail; + } + if(next_dfd >= 0 && linkat(AT_FDCWD, fd_path, next_dfd, name_out, AT_SYMLINK_FOLLOW) < 0) { fprintf(stderr, "Failed to link WAL backup: %m\n"); goto fail; } close(out_fd); - if(fsync(wal_dfd) < 0) { + if(cur_dfd >= 0 && fsync(cur_dfd) < 0) { + fprintf(stderr, "fsync(): %m\n"); + goto fail_dir; + } + if(next_dfd >= 0 && fsync(next_dfd) < 0) { fprintf(stderr, "fsync(): %m\n"); goto fail_dir; } - close(wal_dfd); + close(cur_dfd); skip: flags = bak_begin(flags); @@ -551,14 +634,15 @@ skip: fail: close(out_fd); fail_dir: - close(wal_dfd); + if(cur_dfd >= 0) close(cur_dfd); + if(next_dfd >= 0) close(next_dfd); return 1; } static int cmd_sync(bool force_archive, bool force_snapshot) { int flags = 0; - if(force_archive) flags |= STATE_ARCHIVE; - if(force_snapshot) flags |= STATE_SNAPSHOT; + if(force_archive) flags |= S_WANT_ARCHIVE; + if(force_snapshot) flags |= S_WANT_NEW_SNAPSHOT; flags = bak_begin(flags); if(flags < 0) { return cmd_wait(-1); -- cgit