diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..07115cd --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..893cce5 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/pg-recover.iml b/.idea/pg-recover.iml new file mode 100644 index 0000000..d6ebd48 --- /dev/null +++ b/.idea/pg-recover.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LICENSE b/LICENSE index 95dfe16..2ca9f92 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 garrettmills +Copyright (c) 2025 Garrett Mills Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: diff --git a/README.md b/README.md index 963a07c..cc1ece7 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,47 @@ # pg-recover -A dirty, terrible, dangerous Postgres recovery tool. \ No newline at end of file +> ⚠️ **DO NOT DO THIS…** well ever really, but especially on a server with failing disks. +> This is done on a server with perfectly fine disks, but corrupted Postgres blocks. + +A dirty, terrible, dangerous Postgres recovery tool. + +This is designed to recover as much data as possible from a Postgres table with bad disk blocks +BUT good physical disks (say after an unclean exit, or you've snapshotted bad disks and mounted +them on a good server). It does this by creating a clean "recovery" table with the same schema +and reading rows from the bad table one at a time, skipping rows and blocks with bad disk errors. + +For more specifics on how this works technically, see my blog post. + +Requirements: + +- Bash + tools (cat, tac, cut, sed, &c.) +- Postgres client (`psql` and `pg_dump`) +- Table must have a `SERIAL` primary key and at least one other non-`NULL` column + +Usage: + +```text +USAGE: pg-recover.sh [=500] [] + +user - the Postgres user to connect with +host - the Postgres server host +database - the Postgres database +table - the Postgres table +primary key - the name of the SERIAL primary key column +nonnull col - the name of a DIFFERENT non-null column on the table +commit size - how many rows to recover before committing the transaction (default: 500) +start at - start at the specific primary key (descending) + +Copyright (c) 2025 Garrett Mills +https://code.garrettmills.dev/garrettmills/pg-recover +``` + +Once the script finishes, you can import the recovered data like so: + +```shell +psql [...] < pqr-final-attempt.sql +``` + +This will create a new table `_recovery` with the recovered data. + +License: See the `LICENSE` file. diff --git a/pg-recover.sh b/pg-recover.sh new file mode 100644 index 0000000..d09fa88 --- /dev/null +++ b/pg-recover.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +# Make sure we have all the right parameters: + +PGR_USER="${1:-}" +PGR_HOST="${2:-}" +PGR_BASE="${3:-}" +PGR_TABLE="${4:-}" +PGR_PRIMARY_KEY="${5:-}" +PGR_NONNULL_COL="${6:-}" +PGR_COMMIT_SIZE="${7:-500}" +PGR_START_AT="${8:-}" + +PGR_SKIPS_SQL="" +PGR_START_AT_SQL="" +PASS_NO=0 + +if [ -z "$PGR_USER" ] || [ -z "$PGR_HOST" ] || [ -z "$PGR_BASE" ] || [ -z "$PGR_TABLE" ] || [ -z "$PGR_PRIMARY_KEY" ] || [ -z "$PGR_NONNULL_COL" ]; then + echo "USAGE: pg-recover.sh
[] []" + echo "" + echo "user - the Postgres user to connect with" + echo "host - the Postgres server host" + echo "database - the Postgres database" + echo "table - the Postgres table" + echo "primary key - the name of the SERIAL primary key column" + echo "nonnull col - the name of a DIFFERENT non-null column on the table" + echo "commit size - how many rows to recover before committing the transaction (default: 500)" + echo "start at - start at the specific primary key (descending)" + echo "" + echo "Copyright (c) 2025 Garrett Mills " + echo "https://code.garrettmills.dev/garrettmills/pg-recover" + exit 1 +fi + +if [ -n "$PGR_START_AT" ]; then + PGR_START_AT_SQL="cnt := ${PGR_START_AT};" +fi + + +# Drop and re-create the recovery table based on the original table's DDL +pgr_reset_recovery() { + echo "Resetting recovery table..." + + set -ex + psql -U "$PGR_USER" -h "$PGR_HOST" -c "drop table if exists ${PGR_TABLE}_recovery" "$PGR_BASE" + psql -U "$PGR_USER" -h "$PGR_HOST" -c "create table ${PGR_TABLE}_recovery (like ${PGR_TABLE})" "$PGR_BASE" + set +ex + echo "Done." +} + + +# Replace the placeholders in the pg-recover stored procedure and (re-)create it +pgr_populate_proc() { + set -e + rm -f pg-recover.sql.inst + cp "${SCRIPT_DIR}/pg-recover.sql" pg-recover.sql.inst + + sed -i "s/pgr_primary_key/${PGR_PRIMARY_KEY}/g" pg-recover.sql.inst + sed -i "s/pgr_table/${PGR_TABLE}/g" pg-recover.sql.inst + sed -i "s/pgr_nonnull_col/${PGR_NONNULL_COL}/g" pg-recover.sql.inst + sed -i "s/pgr_commit_size/${PGR_COMMIT_SIZE}/g" pg-recover.sql.inst + sed -i "s/pgr_start_at/${PGR_START_AT_SQL}/g" pg-recover.sql.inst + sed -i.bak -e "/pgr_skips/ {r /dev/stdin" -e "d}" pg-recover.sql.inst <<< "$PGR_SKIPS_SQL" + + psql -U "$PGR_USER" -h "$PGR_HOST" "$PGR_BASE" < pg-recover.sql.inst + set +e +} + + +# Wait for the Postgres server to come back online (e.g. after a crash) +pgr_wait_online() { + local TRIES + TRIES=100 + + while [[ "$TRIES" -gt 0 ]]; do + echo " - Waiting for Postgres server (tries: $TRIES)" + TRIES="$((TRIES - 1))" + if psql -U "$PGR_USER" -h "$PGR_HOST" -c "select 1;" "$PGR_BASE" > /dev/null 2>&1; then + return + fi + + sleep 10 + done + + echo "Postgres server did not come back online in time" + exit 1 +} + + +# (Recursive) Try to recover rows from the bad table to the recovery table. If the server crashes, +# then add a region of the primary key to skip and try again. Dumps the data from each attempt. +pgr_recovery_pass() { + PASS_NO="$((PASS_NO + 1))" + echo "Attempting recovery pass #${PASS_NO}..." + + pgr_reset_recovery + pgr_populate_proc + + psql -U "$PGR_USER" -h "$PGR_HOST" -c "call pg_recover_proc()" "$PGR_BASE" > pg-recover.log 2>&1 + + proc_stat="$?" + if [ "$proc_stat" != 0 ]; then + echo " - Recovery pass failed. Attempting to skip invalid primary key range." + + LAST_ID="$(tail -n 50 pg-recover.log | grep 'PGR_' | tac | head -n 1 | rev | cut -d' ' -f1 | rev)" + PREV_ID="$((LAST_ID - PGR_COMMIT_SIZE))" + NEXT_ID="$((LAST_ID + PGR_COMMIT_SIZE))" + echo " - LAST ID: $LAST_ID | BLOCK RANGE: $PREV_ID - $NEXT_ID" + +PGR_SKIPS_SQL=" +$PGR_SKIPS_SQL +IF cnt <= $NEXT_ID AND cnt > $PREV_ID THEN + cnt := $PREV_ID; +END IF; +" + + pgr_wait_online + + echo " - Dumping recovery data from last attempt" + pg_dump -U "$PGR_USER" -h "$PGR_HOST" --table "${PGR_TABLE}_recovery" "$PGR_BASE" > "pgr-attempt-${PASS_NO}.sql" || echo " - WARN: Failed to dump recovery data" + + pgr_recovery_pass + fi +} + + +# Let 'er rip! +pgr_wait_online +pgr_recovery_pass + +echo "Exporting recovery data..." +pg_dump -U "$PGR_USER" -h "$PGR_HOST" --table "${PGR_TABLE}_recovery" "$PGR_BASE" > "pgr-final-attempt.sql" || echo "ERROR: Failed to dump final recovery data" diff --git a/pg-recover.sql b/pg-recover.sql new file mode 100644 index 0000000..4950823 --- /dev/null +++ b/pg-recover.sql @@ -0,0 +1,42 @@ +CREATE OR REPLACE PROCEDURE pg_recover_proc() +LANGUAGE plpgsql AS $$ +DECLARE + cnt BIGINT := 0; +BEGIN + -- Get the maximum page_view_id from the page_views table + cnt := (SELECT MAX(pgr_primary_key) FROM pgr_table); + pgr_start_at + + -- Loop through the page_views table in reverse order by page_view_id + LOOP + pgr_skips + + BEGIN + -- Insert the row with the current page_view_id into page_views2 + INSERT INTO pgr_table_recovery + SELECT * FROM pgr_table WHERE pgr_primary_key = cnt and pgr_nonnull_col is not null; + + -- Decrement the counter + cnt := cnt - 1; + + -- Exit the loop when cnt < 1 + EXIT WHEN cnt < 1; + EXCEPTION + WHEN OTHERS THEN + -- Handle exceptions (e.g., data corruption) + IF POSITION('block' in SQLERRM) > 0 OR POSITION('status of transaction' in SQLERRM) > 0 OR POSITION('memory alloc' in SQLERRM) > 0 OR POSITION('data is corrupt' in SQLERRM) > 0 OR POSITION('MultiXactId' in SQLERRM) > 0 THEN + RAISE WARNING 'PGR_SKIP: %', cnt; + cnt := cnt - 1; + CONTINUE; + ELSE + RAISE; + END IF; + END; + + IF MOD(cnt, pgr_commit_size) = 0 THEN + RAISE WARNING 'PGR_COMMIT: %', cnt; + COMMIT; + END IF; + END LOOP; +END; +$$;