Add script

2025-01-11 16:20:07 -05:00 · 2025-01-11 16:20:07 -05:00 · d8455cf6a0
commit d8455cf6a0
parent 6b862196f8
9 changed files with 259 additions and 2 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="17" project-jdk-type="JavaSDK">
+    <output url="file://$PROJECT_DIR$/out" />
+  </component>
+</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/pg-recover.iml" filepath="$PROJECT_DIR$/.idea/pg-recover.iml" />
+    </modules>
+  </component>
+</project>
--- a/.idea/pg-recover.iml
+++ b/.idea/pg-recover.iml
@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2025 garrettmills
+Copyright (c) 2025 Garrett Mills <shout@garrettmills.dev>

 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

--- a/README.md
+++ b/README.md
@ -1,3 +1,47 @@
 # pg-recover

+> ⚠️ **DO NOT DO THIS…** well ever really, but especially on a server with failing disks.
+> This is done on a server with perfectly fine disks, but corrupted Postgres blocks.
+
 A dirty, terrible, dangerous Postgres recovery tool.
+
+This is designed to recover as much data as possible from a Postgres table with bad disk blocks
+BUT good physical disks (say after an unclean exit, or you've snapshotted bad disks and mounted
+them on a good server). It does this by creating a clean "recovery" table with the same schema
+and reading rows from the bad table one at a time, skipping rows and blocks with bad disk errors.
+
+For more specifics on how this works technically, see my blog post.
+
+Requirements:
+
+- Bash + tools (cat, tac, cut, sed, &c.)
+- Postgres client (`psql` and `pg_dump`)
+- Table must have a `SERIAL` primary key and at least one other non-`NULL` column
+
+Usage:
+
+```text
+USAGE: pg-recover.sh <user> <host> <database> <table> <primary key> <nonnull col> [<commit size>=500] [<start at>]
+
+user - the Postgres user to connect with
+host - the Postgres server host
+database - the Postgres database
+table - the Postgres table
+primary key - the name of the SERIAL primary key column
+nonnull col - the name of a DIFFERENT non-null column on the table
+commit size - how many rows to recover before committing the transaction (default: 500)
+start at - start at the specific primary key (descending)
+
+Copyright (c) 2025 Garrett Mills <shout@garrettmills.dev>
+https://code.garrettmills.dev/garrettmills/pg-recover
+```
+
+Once the script finishes, you can import the recovered data like so:
+
+```shell
+psql [...] < pqr-final-attempt.sql
+```
+
+This will create a new table `<original table>_recovery` with the recovered data.
+
+License: See the `LICENSE` file.
--- a/pg-recover.sh
+++ b/pg-recover.sh
@ -0,0 +1,134 @@
+#!/bin/bash
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+# Make sure we have all the right parameters:
+
+PGR_USER="${1:-}"
+PGR_HOST="${2:-}"
+PGR_BASE="${3:-}"
+PGR_TABLE="${4:-}"
+PGR_PRIMARY_KEY="${5:-}"
+PGR_NONNULL_COL="${6:-}"
+PGR_COMMIT_SIZE="${7:-500}"
+PGR_START_AT="${8:-}"
+
+PGR_SKIPS_SQL=""
+PGR_START_AT_SQL=""
+PASS_NO=0
+
+if [ -z "$PGR_USER" ] || [ -z "$PGR_HOST" ] || [ -z "$PGR_BASE" ] || [ -z "$PGR_TABLE" ] || [ -z "$PGR_PRIMARY_KEY" ] || [ -z "$PGR_NONNULL_COL" ]; then
+	echo "USAGE: pg-recover.sh <user> <host> <database> <table> <primary key> <nonnull col> [<commit size=500>] [<start at>]"
+	echo ""
+	echo "user - the Postgres user to connect with"
+	echo "host - the Postgres server host"
+	echo "database - the Postgres database"
+	echo "table - the Postgres table"
+	echo "primary key - the name of the SERIAL primary key column"
+	echo "nonnull col - the name of a DIFFERENT non-null column on the table"
+	echo "commit size - how many rows to recover before committing the transaction (default: 500)"
+	echo "start at - start at the specific primary key (descending)"
+	echo ""
+	echo "Copyright (c) 2025 Garrett Mills <shout@garrettmills.dev>"
+	echo "https://code.garrettmills.dev/garrettmills/pg-recover"
+	exit 1
+fi
+
+if [ -n "$PGR_START_AT" ]; then
+	PGR_START_AT_SQL="cnt := ${PGR_START_AT};"
+fi
+
+
+# Drop and re-create the recovery table based on the original table's DDL
+pgr_reset_recovery() {
+	echo "Resetting recovery table..."
+
+	set -ex
+	psql -U "$PGR_USER" -h "$PGR_HOST" -c "drop table if exists ${PGR_TABLE}_recovery" "$PGR_BASE"
+	psql -U "$PGR_USER" -h "$PGR_HOST" -c "create table ${PGR_TABLE}_recovery (like ${PGR_TABLE})" "$PGR_BASE"
+	set +ex
+	echo "Done."
+}
+
+
+# Replace the placeholders in the pg-recover stored procedure and (re-)create it
+pgr_populate_proc() {
+	set -e
+	rm -f pg-recover.sql.inst
+	cp "${SCRIPT_DIR}/pg-recover.sql" pg-recover.sql.inst
+
+	sed -i "s/pgr_primary_key/${PGR_PRIMARY_KEY}/g" pg-recover.sql.inst
+	sed -i "s/pgr_table/${PGR_TABLE}/g" pg-recover.sql.inst
+	sed -i "s/pgr_nonnull_col/${PGR_NONNULL_COL}/g" pg-recover.sql.inst
+	sed -i "s/pgr_commit_size/${PGR_COMMIT_SIZE}/g" pg-recover.sql.inst
+	sed -i "s/pgr_start_at/${PGR_START_AT_SQL}/g" pg-recover.sql.inst
+	sed -i.bak -e "/pgr_skips/ {r /dev/stdin" -e "d}" pg-recover.sql.inst <<< "$PGR_SKIPS_SQL"
+
+	psql -U "$PGR_USER" -h "$PGR_HOST" "$PGR_BASE" < pg-recover.sql.inst
+	set +e
+}
+
+
+# Wait for the Postgres server to come back online (e.g. after a crash)
+pgr_wait_online() {
+	local TRIES
+	TRIES=100
+
+	while [[ "$TRIES" -gt 0 ]]; do
+		echo "   - Waiting for Postgres server (tries: $TRIES)"
+		TRIES="$((TRIES - 1))"
+		if psql -U "$PGR_USER" -h "$PGR_HOST" -c "select 1;" "$PGR_BASE" > /dev/null 2>&1; then
+			return
+		fi
+
+		sleep 10
+	done
+
+	echo "Postgres server did not come back online in time"
+	exit 1
+}
+
+
+# (Recursive) Try to recover rows from the bad table to the recovery table. If the server crashes,
+# then add a region of the primary key to skip and try again. Dumps the data from each attempt.
+pgr_recovery_pass() {
+	PASS_NO="$((PASS_NO + 1))"
+	echo "Attempting recovery pass #${PASS_NO}..."
+
+	pgr_reset_recovery
+	pgr_populate_proc
+
+	psql -U "$PGR_USER" -h "$PGR_HOST" -c "call pg_recover_proc()" "$PGR_BASE" > pg-recover.log 2>&1
+
+	proc_stat="$?"
+	if [ "$proc_stat" != 0 ]; then
+		echo "   - Recovery pass failed. Attempting to skip invalid primary key range."
+
+		LAST_ID="$(tail -n 50 pg-recover.log | grep 'PGR_' | tac | head -n 1 | rev | cut -d' ' -f1 | rev)"
+		PREV_ID="$((LAST_ID - PGR_COMMIT_SIZE))"
+		NEXT_ID="$((LAST_ID + PGR_COMMIT_SIZE))"
+		echo "   - LAST ID: $LAST_ID   |   BLOCK RANGE: $PREV_ID - $NEXT_ID"
+
+PGR_SKIPS_SQL="
+$PGR_SKIPS_SQL
+IF cnt <= $NEXT_ID AND cnt > $PREV_ID THEN
+	cnt := $PREV_ID;
+END IF;
+"
+
+		pgr_wait_online
+
+		echo "   - Dumping recovery data from last attempt"
+		pg_dump -U "$PGR_USER" -h "$PGR_HOST" --table "${PGR_TABLE}_recovery" "$PGR_BASE" > "pgr-attempt-${PASS_NO}.sql" || echo "     - WARN: Failed to dump recovery data"
+
+		pgr_recovery_pass
+	fi
+}
+
+
+# Let 'er rip!
+pgr_wait_online
+pgr_recovery_pass
+
+echo "Exporting recovery data..."
+pg_dump -U "$PGR_USER" -h "$PGR_HOST" --table "${PGR_TABLE}_recovery" "$PGR_BASE" > "pgr-final-attempt.sql" || echo "ERROR: Failed to dump final recovery data"
--- a/pg-recover.sql
+++ b/pg-recover.sql
@ -0,0 +1,42 @@
+CREATE OR REPLACE PROCEDURE pg_recover_proc()
+LANGUAGE plpgsql AS $$
+DECLARE
+	cnt BIGINT := 0;
+BEGIN
+	-- Get the maximum page_view_id from the page_views table
+	cnt := (SELECT MAX(pgr_primary_key) FROM pgr_table);
+	pgr_start_at
+
+	-- Loop through the page_views table in reverse order by page_view_id
+	LOOP
+		pgr_skips
+
+		BEGIN
+			-- Insert the row with the current page_view_id into page_views2
+			INSERT INTO pgr_table_recovery
+			SELECT * FROM pgr_table WHERE pgr_primary_key = cnt and pgr_nonnull_col is not null;
+
+			-- Decrement the counter
+			cnt := cnt - 1;
+
+			-- Exit the loop when cnt < 1
+			EXIT WHEN cnt < 1;
+		EXCEPTION
+			WHEN OTHERS THEN
+				-- Handle exceptions (e.g., data corruption)
+				IF POSITION('block' in SQLERRM) > 0 OR POSITION('status of transaction' in SQLERRM) > 0 OR POSITION('memory alloc' in SQLERRM) > 0 OR POSITION('data is corrupt' in SQLERRM) > 0 OR POSITION('MultiXactId' in SQLERRM) > 0 THEN
+					RAISE WARNING 'PGR_SKIP: %', cnt;
+					cnt := cnt - 1;
+					CONTINUE;
+				ELSE
+					RAISE;
+				END IF;
+			END;
+
+		IF MOD(cnt, pgr_commit_size) = 0 THEN
+			RAISE WARNING 'PGR_COMMIT: %', cnt;
+			COMMIT;
+		END IF;
+	END LOOP;
+END;
+$$;