#!/bin/bash
#
# pg-recover.sh - copy rows out of a damaged Postgres table into a recovery table.
#
# USAGE:
#   pg-recover.sh <user> <host> <database> <table> <primary key> <nonnull col> \
#                 [<commit size>] [<start at>]
#
# What the script does:
#   1. (Re-)creates "<table>_recovery" with the same shape as <table>.
#   2. Instantiates the pg-recover.sql template that lives next to this script
#      (placeholders: pgr_primary_key, pgr_table, pgr_nonnull_col,
#      pgr_commit_size, pgr_start_at, pgr_skips) and loads it with psql.
#   3. Calls pg_recover_proc(). If the call fails, the last ID reported on a
#      "PGR_" line of the log is used to add a window of primary keys to a
#      skip list, the partial recovery table is dumped, and the pass is retried.
#   4. Dumps the recovery table once a pass completes successfully.
#
# Files written to the current working directory:
#   pg-recover.sql.inst    instantiated SQL template
#   pg-recover.log         psql output of the most recent pass
#   pgr-attempt-<N>.sql    dump of the recovery table after failed pass N
#   pgr-final-attempt.sql  dump of the recovery table after the final pass
#
# NOTE(review): table/column names are interpolated into SQL unquoted; this
# script is meant to be run by a trusted operator.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Make sure we have all the right parameters:
PGR_USER="${1:-}"
PGR_HOST="${2:-}"
PGR_BASE="${3:-}"
PGR_TABLE="${4:-}"
PGR_PRIMARY_KEY="${5:-}"
PGR_NONNULL_COL="${6:-}"
PGR_COMMIT_SIZE="${7:-500}"
PGR_START_AT="${8:-}"

# SQL fragments spliced into the template; the skip list grows with every failed pass.
PGR_SKIPS_SQL=""
PGR_START_AT_SQL=""
PASS_NO=0

if [[ -z "$PGR_USER" || -z "$PGR_HOST" || -z "$PGR_BASE" \
  || -z "$PGR_TABLE" || -z "$PGR_PRIMARY_KEY" || -z "$PGR_NONNULL_COL" ]]; then
  echo "USAGE: pg-recover.sh <user> <host> <database> <table> <primary key> <nonnull col> [<commit size>] [<start at>]"
  echo ""
  echo "user - the Postgres user to connect with"
  echo "host - the Postgres server host"
  echo "database - the Postgres database"
  echo "table - the Postgres table"
  echo "primary key - the name of the SERIAL primary key column"
  echo "nonnull col - the name of a DIFFERENT non-null column on the table"
  echo "commit size - how many rows to recover before committing the transaction (default: 500)"
  echo "start at - start at the specific primary key (descending)"
  echo ""
  echo "Copyright (c) 2025 Garrett Mills "
  echo "https://code.garrettmills.dev/garrettmills/pg-recover"
  exit 1
fi

# The commit size is used in shell arithmetic to size the skip window. Zero
# would produce an empty window (and an endless retry loop); non-numeric text
# would be evaluated as an arithmetic expression.
if ! [[ "$PGR_COMMIT_SIZE" =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR: commit size must be a positive integer (got: ${PGR_COMMIT_SIZE})" >&2
  exit 1
fi

if [[ -n "$PGR_START_AT" ]]; then
  PGR_START_AT_SQL="cnt := ${PGR_START_AT};"
fi

# Escape a value for use on the replacement side of a sed "s/…/…/" command.
# Arguments: $1 - raw value
# Outputs:   the value with "\", "/" and "&" backslash-escaped
pgr_sed_escape() {
  printf '%s' "$1" | sed -e 's|[\\/&]|\\&|g'
}

# Drop and re-create the recovery table based on the original table's DDL.
# Globals: PGR_USER, PGR_HOST, PGR_BASE, PGR_TABLE (read)
# Exits the script if either statement fails (errexit is on for the duration);
# xtrace is on so the exact psql commands show up in the output.
pgr_reset_recovery() {
  echo "Resetting recovery table..."
  set -ex
  psql -U "$PGR_USER" -h "$PGR_HOST" -c "drop table if exists ${PGR_TABLE}_recovery" "$PGR_BASE"
  psql -U "$PGR_USER" -h "$PGR_HOST" -c "create table ${PGR_TABLE}_recovery (like ${PGR_TABLE})" "$PGR_BASE"
  set +ex
  echo "Done."
}

# Replace the placeholders in the pg-recover stored procedure and (re-)create it.
# Globals: SCRIPT_DIR, PGR_* parameters, PGR_START_AT_SQL, PGR_SKIPS_SQL (read)
# Outputs: writes pg-recover.sql.inst in the current directory and feeds it to psql
# Exits the script if any step fails (errexit is on for the duration).
pgr_populate_proc() {
  local primary_key table nonnull_col commit_size start_at

  set -e
  primary_key="$(pgr_sed_escape "$PGR_PRIMARY_KEY")"
  table="$(pgr_sed_escape "$PGR_TABLE")"
  nonnull_col="$(pgr_sed_escape "$PGR_NONNULL_COL")"
  commit_size="$(pgr_sed_escape "$PGR_COMMIT_SIZE")"
  start_at="$(pgr_sed_escape "$PGR_START_AT_SQL")"

  rm -f pg-recover.sql.inst

  # One sed pass from template to instance file: no in-place editing (whose
  # flag syntax differs between sed implementations) and no leftover backup.
  # The line containing "pgr_skips" is replaced by the accumulated skip SQL,
  # which is supplied on stdin and is not itself subject to the substitutions.
  sed \
    -e "s/pgr_primary_key/${primary_key}/g" \
    -e "s/pgr_table/${table}/g" \
    -e "s/pgr_nonnull_col/${nonnull_col}/g" \
    -e "s/pgr_commit_size/${commit_size}/g" \
    -e "s/pgr_start_at/${start_at}/g" \
    -e "/pgr_skips/ {r /dev/stdin" \
    -e "d}" \
    "${SCRIPT_DIR}/pg-recover.sql" > pg-recover.sql.inst <<< "$PGR_SKIPS_SQL"

  psql -U "$PGR_USER" -h "$PGR_HOST" "$PGR_BASE" < pg-recover.sql.inst
  set +e
}

# Wait for the Postgres server to come back online (e.g. after a crash).
# Polls with "select 1;" up to 100 times, sleeping 10 seconds between tries.
# Returns once the server answers; exits the script with status 1 otherwise.
pgr_wait_online() {
  local tries=100

  while (( tries > 0 )); do
    echo " - Waiting for Postgres server (tries: $tries)"
    tries=$(( tries - 1 ))
    if psql -U "$PGR_USER" -h "$PGR_HOST" -c "select 1;" "$PGR_BASE" > /dev/null 2>&1; then
      return
    fi
    sleep 10
  done

  echo "Postgres server did not come back online in time" >&2
  exit 1
}

# Try to recover rows from the bad table to the recovery table. If the pass
# fails (e.g. the server crashes), add a region of the primary key to skip and
# try again. Dumps the data from each failed attempt.
# Globals: PASS_NO, PGR_SKIPS_SQL (written); PGR_* parameters (read)
# Returns: 0 once a pass completes; exits the script with status 1 when the
#          failing position cannot be determined from pg-recover.log.
pgr_recovery_pass() {
  local proc_stat last_id prev_id next_id

  while true; do
    PASS_NO="$((PASS_NO + 1))"
    echo "Attempting recovery pass #${PASS_NO}..."

    pgr_reset_recovery
    pgr_populate_proc

    psql -U "$PGR_USER" -h "$PGR_HOST" -c "call pg_recover_proc()" "$PGR_BASE" > pg-recover.log 2>&1
    proc_stat="$?"
    if [[ "$proc_stat" == 0 ]]; then
      return 0
    fi

    echo " - Recovery pass failed. Attempting to skip invalid primary key range."

    # The last whitespace-separated field of the final "PGR_" log line is taken
    # as the most recently reported primary key.
    last_id="$(awk '/PGR_/ { id = $NF } END { print id }' pg-recover.log)"

    # Without a numeric ID there is nothing sensible to skip; retrying would
    # loop forever on the same failure (bad credentials, missing procedure, ...).
    if ! [[ "$last_id" =~ ^-?[0-9]+$ ]]; then
      echo "ERROR: psql exited with status ${proc_stat} and no progress marker was found in pg-recover.log; giving up" >&2
      exit 1
    fi

    prev_id="$((last_id - PGR_COMMIT_SIZE))"
    next_id="$((last_id + PGR_COMMIT_SIZE))"
    echo " - LAST ID: $last_id | BLOCK RANGE: $prev_id - $next_id"

    # Jump the cursor past one commit-size window on either side of the last
    # reported ID.
    PGR_SKIPS_SQL="
      $PGR_SKIPS_SQL
      IF cnt <= $next_id AND cnt > $prev_id THEN
        cnt := $prev_id;
      END IF;
    "

    pgr_wait_online

    echo " - Dumping recovery data from last attempt"
    pg_dump -U "$PGR_USER" -h "$PGR_HOST" --table "${PGR_TABLE}_recovery" "$PGR_BASE" > "pgr-attempt-${PASS_NO}.sql" \
      || echo " - WARN: Failed to dump recovery data"
  done
}

# Let 'er rip!
pgr_wait_online
pgr_recovery_pass

echo "Exporting recovery data..."
if ! pg_dump -U "$PGR_USER" -h "$PGR_HOST" --table "${PGR_TABLE}_recovery" "$PGR_BASE" > "pgr-final-attempt.sql"; then
  echo "ERROR: Failed to dump final recovery data" >&2
  exit 1
fi