#!/bin/bash
#
# PostgreSQL OCF RA with advanced replication slot management
# Version: 1.7.5
# Release Date: 2026-04-29
#
# Description:  Manages PostgreSQL as an OCF HA resource with
#               physical replication, slot management, pg_rewind support,
#               automatic replication recovery, dynamic promoted node discovery,
#               automatic standby initialization, Pacemaker notify support,
#               replication slot creation before basebackup (prevents WAL recycling),
#               self-triggered resource cleanup after basebackup completion,
#               and seamless container mode support (Podman/Docker)
#

# Set PATH to include standard system directories (Pacemaker has minimal PATH)
export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:$PATH

# OCF instance parameters:
#   OCF_RESKEY_pgdata           - PostgreSQL data directory
#   OCF_RESKEY_pghost           - IP address to bind to
#   OCF_RESKEY_pgport           - PostgreSQL port
#   OCF_RESKEY_pguser           - PostgreSQL user for replication
#   OCF_RESKEY_application_name - Application name for replication (alphanumeric + underscore only)
#   OCF_RESKEY_slot_name        - Replication slot name
#   OCF_RESKEY_max_slot_wal_keep_size - Max replication slot size (MB)
#   OCF_RESKEY_monitor_timeout_promoted - Monitor timeout for promoted (seconds)
#   OCF_RESKEY_monitor_timeout_unpromoted - Monitor timeout for unpromoted (seconds)
#   OCF_RESKEY_rep_mode         - Replication mode (sync/async)
#   OCF_RESKEY_node_list        - Space-separated list of cluster nodes
#   OCF_RESKEY_backup_before_basebackup - Backup data before basebackup (true/false)
#   OCF_RESKEY_basebackup_timeout - Timeout for pg_basebackup operation (seconds)
#   OCF_RESKEY_pgpassfile       - Path to .pgpass file for replication credentials
#   OCF_RESKEY_replication_failure_threshold - Number of consecutive monitor cycles with
#                                               failed replication before triggering recovery (default: 5)
#   OCF_RESKEY_vip              - Virtual IP address (used to discover promoted node)
#   OCF_RESKEY_container_mode   - Enable container mode (true/false)
#   OCF_RESKEY_container_name   - Container name for PostgreSQL instance
#   OCF_RESKEY_container_image  - Container image to use (default: registry.opensuse.org/devel/bci/tumbleweed/containerfile/opensuse/postgres:17.6-158.5)
#   OCF_RESKEY_pg_bindir        - Directory containing PostgreSQL binaries (default: /usr/bin)
#   OCF_RESKEY_pg_major_version - PostgreSQL major version for container image discovery (container mode)
#   OCF_RESKEY_max_promotion_lag_bytes - Max replication lag (bytes) allowed before promotion (async mode)
#   OCF_RESKEY_force_promotion  - Force promotion, bypassing lag/sync safety checks (true/false)
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Helper function to run commands as PostgreSQL user
# Uses setpriv instead of runuser to avoid PAM session logging noise
# Arguments: command and arguments to execute
# Returns:   1 if the user's UID/GID cannot be resolved, otherwise the
#            exit status of the executed command
run_as_pguser() {
    local pguser="${OCF_RESKEY_pguser:-postgres}"
    local pguid pggid
    # BUG FIX: declaration and assignment must be separate statements.
    # 'local v=$(cmd) || {...}' always sees the exit status of 'local'
    # (which is 0), so the error branch could never fire on lookup failure.
    pguid=$(id -u "$pguser" 2>/dev/null) || {
        ocf_log err "Cannot determine UID for user: $pguser"
        return 1
    }
    pggid=$(id -g "$pguser" 2>/dev/null) || {
        ocf_log err "Cannot determine GID for user: $pguser"
        return 1
    }
    setpriv --reuid="$pguid" --regid="$pggid" --clear-groups "$@"
}

# Load container library (provides seamless container/bare-metal support)
if [ -f "${OCF_FUNCTIONS_DIR}/pgtwin-container-lib.sh" ]; then
    . "${OCF_FUNCTIONS_DIR}/pgtwin-container-lib.sh"
elif [ -f "/tmp/pgtwin-container-lib.sh" ]; then
    . "/tmp/pgtwin-container-lib.sh"
fi

# OCF return codes for promoted/master state
: ${OCF_RUNNING_PROMOTED:=8}
: ${OCF_RUNNING_MASTER:=8}

#######################################################################
# Defaults

OCF_RESKEY_pgdata_default="/var/lib/pgsql/data"
OCF_RESKEY_pghost_default=""
OCF_RESKEY_pgport_default="5432"
OCF_RESKEY_pguser_default="postgres"
OCF_RESKEY_application_name_default=""
OCF_RESKEY_slot_name_default="ha_slot"
OCF_RESKEY_max_slot_wal_keep_size_default="1024"
OCF_RESKEY_monitor_timeout_promoted_default="60"
OCF_RESKEY_monitor_timeout_unpromoted_default="30"
OCF_RESKEY_rep_mode_default="sync"
OCF_RESKEY_node_list_default=""
OCF_RESKEY_backup_before_basebackup_default="true"
OCF_RESKEY_basebackup_timeout_default="3600"
OCF_RESKEY_pgpassfile_default=""
OCF_RESKEY_replication_failure_threshold_default="5"
OCF_RESKEY_vip_default=""
OCF_RESKEY_container_mode_default="false"
OCF_RESKEY_pg_major_version_default=""
OCF_RESKEY_container_name_default="postgres-ha"
OCF_RESKEY_container_image_default="registry.opensuse.org/devel/bci/tumbleweed/containerfile/opensuse/postgres:17.6-158.5"
# BUG FIX: these two defaults are referenced by the meta-data XML
# (max_promotion_lag_bytes, force_promotion parameters) but were never
# defined, so the advertised defaults expanded to empty strings and the
# OCF_RESKEY_* variables were left unset. Values match the documented
# defaults: 10485760 bytes (10 MB) and false.
OCF_RESKEY_max_promotion_lag_bytes_default="10485760"
OCF_RESKEY_force_promotion_default="false"

: ${OCF_RESKEY_pgdata=${OCF_RESKEY_pgdata_default}}
: ${OCF_RESKEY_pghost=${OCF_RESKEY_pghost_default}}
: ${OCF_RESKEY_pgport=${OCF_RESKEY_pgport_default}}
: ${OCF_RESKEY_pguser=${OCF_RESKEY_pguser_default}}
: ${OCF_RESKEY_application_name=${OCF_RESKEY_application_name_default}}
: ${OCF_RESKEY_slot_name=${OCF_RESKEY_slot_name_default}}
: ${OCF_RESKEY_max_slot_wal_keep_size=${OCF_RESKEY_max_slot_wal_keep_size_default}}
: ${OCF_RESKEY_monitor_timeout_promoted=${OCF_RESKEY_monitor_timeout_promoted_default}}
: ${OCF_RESKEY_monitor_timeout_unpromoted=${OCF_RESKEY_monitor_timeout_unpromoted_default}}
: ${OCF_RESKEY_rep_mode=${OCF_RESKEY_rep_mode_default}}
: ${OCF_RESKEY_node_list=${OCF_RESKEY_node_list_default}}
: ${OCF_RESKEY_backup_before_basebackup=${OCF_RESKEY_backup_before_basebackup_default}}
: ${OCF_RESKEY_basebackup_timeout=${OCF_RESKEY_basebackup_timeout_default}}
: ${OCF_RESKEY_pgpassfile=${OCF_RESKEY_pgpassfile_default}}
: ${OCF_RESKEY_replication_failure_threshold=${OCF_RESKEY_replication_failure_threshold_default}}
: ${OCF_RESKEY_vip=${OCF_RESKEY_vip_default}}
: ${OCF_RESKEY_container_mode=${OCF_RESKEY_container_mode_default}}
: ${OCF_RESKEY_pg_major_version=${OCF_RESKEY_pg_major_version_default}}
: ${OCF_RESKEY_container_name=${OCF_RESKEY_container_name_default}}
: ${OCF_RESKEY_container_image=${OCF_RESKEY_container_image_default}}
: ${OCF_RESKEY_max_promotion_lag_bytes=${OCF_RESKEY_max_promotion_lag_bytes_default}}
: ${OCF_RESKEY_force_promotion=${OCF_RESKEY_force_promotion_default}}

PGDATA="${OCF_RESKEY_pgdata}"
# PostgreSQL binaries - use pg_bindir parameter with fallback to /usr/bin
: ${OCF_RESKEY_pg_bindir:="/usr/bin"}
PGCTL="${OCF_RESKEY_pg_bindir}/pg_ctl"
PSQL="${OCF_RESKEY_pg_bindir}/psql"
PG_REWIND="${OCF_RESKEY_pg_bindir}/pg_rewind"
PG_BASEBACKUP="${OCF_RESKEY_pg_bindir}/pg_basebackup"
POSTGRES="${OCF_RESKEY_pg_bindir}/postgres"

#######################################################################

# Print the OCF resource-agent metadata XML on stdout.
# Parameter defaults interpolate the OCF_RESKEY_*_default variables defined
# near the top of this file, so this must run after they are set.
# BUG FIX: version bumped 1.6.9 -> 1.7.5 to match the file header.
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="pgtwin" version="1.7.5">
<version>1.7.5</version>

<longdesc lang="en">
Resource agent for PostgreSQL with advanced replication slot management,
pg_rewind support, automatic failover handling, automatic replication recovery,
automatic standby initialization from empty PGDATA, and seamless container mode support (Podman/Docker).
Includes enhanced monitoring for replication health, dynamic promoted node discovery, and zero-touch deployment.
</longdesc>
<shortdesc lang="en">PostgreSQL HA with auto-initialization and container support</shortdesc>

<parameters>
<parameter name="pgdata" unique="0" required="1">
<longdesc lang="en">
Path to PostgreSQL data directory
</longdesc>
<shortdesc lang="en">PostgreSQL data directory</shortdesc>
<content type="string" default="${OCF_RESKEY_pgdata_default}" />
</parameter>

<parameter name="pghost" unique="0">
<longdesc lang="en">
IP address to bind PostgreSQL to
</longdesc>
<shortdesc lang="en">Bind IP address</shortdesc>
<content type="string" default="${OCF_RESKEY_pghost_default}" />
</parameter>

<parameter name="pgport" unique="0">
<longdesc lang="en">
PostgreSQL port number
</longdesc>
<shortdesc lang="en">PostgreSQL port</shortdesc>
<content type="integer" default="${OCF_RESKEY_pgport_default}" />
</parameter>

<parameter name="pguser" unique="0">
<longdesc lang="en">
PostgreSQL user for operations
</longdesc>
<shortdesc lang="en">PostgreSQL user</shortdesc>
<content type="string" default="${OCF_RESKEY_pguser_default}" />
</parameter>

<parameter name="application_name" unique="0">
<longdesc lang="en">
Application name for replication connections. Must contain only alphanumeric characters and underscores.
Hyphens (-) are not allowed. If empty, hostname will be used.
</longdesc>
<shortdesc lang="en">Application name</shortdesc>
<content type="string" default="${OCF_RESKEY_application_name_default}" />
</parameter>

<parameter name="slot_name" unique="0">
<longdesc lang="en">
Physical replication slot name
</longdesc>
<shortdesc lang="en">Replication slot name</shortdesc>
<content type="string" default="${OCF_RESKEY_slot_name_default}" />
</parameter>

<parameter name="max_slot_wal_keep_size" unique="0">
<longdesc lang="en">
Maximum size (in MB) for replication slot before automatic dropping.

When a replication slot exceeds this size, it indicates the standby has been
offline for a long time. The slot will be automatically dropped and the standby
will be recovered via pg_basebackup when it reconnects.

v1.6.9 Enhancement: Additional safety check compares slot size to database size.
If slot size exceeds database size, the slot will be dropped regardless of this
threshold, as pg_basebackup would be more efficient than replaying massive amounts
of WAL.

Values:
  0 = Disable automatic slot dropping (manual management required)
  &gt;0 = Drop slot if it exceeds this many MB OR exceeds database size

Default: 1024 (1 GB)

Examples:
  0     = Disabled (never drop slots automatically)
  512   = Drop if &gt; 512 MB
  1024  = Drop if &gt; 1 GB (default)
  10240 = Drop if &gt; 10 GB

Recommendation: For production, use default (1024) or higher. Set to 0 only
if you have monitoring and manual slot management procedures in place.
</longdesc>
<shortdesc lang="en">Max slot WAL size (MB), 0=disable</shortdesc>
<content type="integer" default="${OCF_RESKEY_max_slot_wal_keep_size_default}" />
</parameter>

<parameter name="monitor_timeout_promoted" unique="0">
<longdesc lang="en">
Monitor timeout for promoted instance (seconds)
</longdesc>
<shortdesc lang="en">Monitor timeout promoted</shortdesc>
<content type="integer" default="${OCF_RESKEY_monitor_timeout_promoted_default}" />
</parameter>

<parameter name="monitor_timeout_unpromoted" unique="0">
<longdesc lang="en">
Monitor timeout for unpromoted instance (seconds)
</longdesc>
<shortdesc lang="en">Monitor timeout unpromoted</shortdesc>
<content type="integer" default="${OCF_RESKEY_monitor_timeout_unpromoted_default}" />
</parameter>

<parameter name="rep_mode" unique="0">
<longdesc lang="en">
Replication mode: sync or async
</longdesc>
<shortdesc lang="en">Replication mode</shortdesc>
<content type="string" default="${OCF_RESKEY_rep_mode_default}" />
</parameter>

<parameter name="node_list" unique="0">
<longdesc lang="en">
Space-separated list of cluster node names
</longdesc>
<shortdesc lang="en">Cluster node list</shortdesc>
<content type="string" default="${OCF_RESKEY_node_list_default}" />
</parameter>

<parameter name="backup_before_basebackup" unique="0">
<longdesc lang="en">
Whether to backup existing data directory before pg_basebackup.

RECOMMENDED: true for production (safe but uses 2x disk space)

If true (DEFAULT - SAFE):
  - Moves existing data to timestamped backup directory (.backup.TIMESTAMP)
  - Full recovery possible if pg_basebackup fails
  - Requires double disk space temporarily (old backup + new basebackup)
  - Automatic cleanup of old backup directories after successful basebackup
  - Use for: Production environments, when data safety is critical

If false (RISKY - saves disk space):
  - Permanently deletes existing data directory before pg_basebackup
  - NO RECOVERY possible if pg_basebackup fails
  - Saves 50% disk space (only needs space for new basebackup)
  - Only use when:
    * External backup solution exists (Barman, pgBackRest, etc.)
    * Development/testing environment where data loss is acceptable
    * Disk space is critically limited AND you accept data loss risk

Default: true (prioritizes safety over disk space)
</longdesc>
<shortdesc lang="en">Backup before basebackup</shortdesc>
<content type="boolean" default="${OCF_RESKEY_backup_before_basebackup_default}" />
</parameter>

<parameter name="basebackup_timeout" unique="0">
<longdesc lang="en">
Timeout in seconds for pg_basebackup operation
</longdesc>
<shortdesc lang="en">Basebackup timeout (seconds)</shortdesc>
<content type="integer" default="${OCF_RESKEY_basebackup_timeout_default}" />
</parameter>

<parameter name="pgpassfile" unique="0">
<longdesc lang="en">
Path to .pgpass file for reading replication credentials.
Format: host:port:database:username:password
If empty, defaults to ~postgres/.pgpass
</longdesc>
<shortdesc lang="en">Path to .pgpass file</shortdesc>
<content type="string" default="${OCF_RESKEY_pgpassfile_default}" />
</parameter>

<parameter name="replication_failure_threshold" unique="0">
<longdesc lang="en">
Number of consecutive monitor cycles with failed/missing replication before
automatically triggering recovery (pg_rewind or pg_basebackup).

This prevents temporary network issues from triggering unnecessary recovery,
while ensuring persistent replication failures (like timeline divergence) are
automatically resolved.

Default: 5 (approximately 40 seconds with default 8s monitor interval for standby)
</longdesc>
<shortdesc lang="en">Replication failure threshold</shortdesc>
<content type="integer" default="${OCF_RESKEY_replication_failure_threshold_default}" />
</parameter>

<parameter name="vip" unique="0">
<longdesc lang="en">
Virtual IP address of the cluster (optional).
If specified, used to dynamically discover the current promoted node during demote operations.
If not specified, falls back to using node_list for promoted node discovery.
</longdesc>
<shortdesc lang="en">Virtual IP address</shortdesc>
<content type="string" default="${OCF_RESKEY_vip_default}" />
</parameter>

<parameter name="container_mode" unique="0">
<longdesc lang="en">
Enable container mode for PostgreSQL. When true, PostgreSQL runs in a Podman container
instead of directly on the host. This enables:
- Easy version management and upgrades
- Resource isolation (CPU, memory, I/O)
- Clean separation between multiple PostgreSQL versions
- Migration path between bare-metal and containerized deployments

Default: false (bare-metal PostgreSQL)
</longdesc>
<shortdesc lang="en">Enable container mode</shortdesc>
<content type="boolean" default="${OCF_RESKEY_container_mode_default}" />
</parameter>

<parameter name="pg_major_version" unique="0">
<longdesc lang="en">
PostgreSQL major version to use in container mode (e.g., "17", "16", "15").
When specified, pgtwin automatically discovers and uses the latest available
container image for that major version from the registry.

This provides:
- Automatic minor version updates (e.g., 17.6 → 17.7)
- No need to track specific image tags
- Always uses the latest stable release for the major version

Set to "auto" or leave empty to use container_image parameter instead.

Examples:
  pg_major_version="17"  → Uses latest PG 17 (e.g., 17.6-158.5)
  pg_major_version="16"  → Uses latest PG 16 (e.g., 16.10-300.5)

Default: "" (uses container_image parameter)
</longdesc>
<shortdesc lang="en">PostgreSQL major version</shortdesc>
<content type="string" default="${OCF_RESKEY_pg_major_version_default}" />
</parameter>

<parameter name="container_name" unique="0">
<longdesc lang="en">
Name for the PostgreSQL container. Used when container_mode=true.
Each node should have a unique container name if running multiple instances.

Default: postgres-ha
</longdesc>
<shortdesc lang="en">Container name</shortdesc>
<content type="string" default="${OCF_RESKEY_container_name_default}" />
</parameter>

<parameter name="container_image" unique="0">
<longdesc lang="en">
Container image to use for PostgreSQL when container_mode=true.
Uses openSUSE BCI (Base Container Images) by default.

Default: registry.opensuse.org/devel/bci/tumbleweed/containerfile/opensuse/postgres:17.6-158.5
</longdesc>
<shortdesc lang="en">Container image</shortdesc>
<content type="string" default="${OCF_RESKEY_container_image_default}" />
</parameter>

<parameter name="max_promotion_lag_bytes" unique="0">
<longdesc lang="en">
Maximum replication lag (in bytes) allowed before promoting a standby to primary.
Only applies when rep_mode=async. In sync mode, PostgreSQL's synchronous replication
guarantees the standby is caught up.

This prevents data loss during planned failover by blocking promotion of standbys
that are not caught up with the primary.

Values:
  -1 = Disable lag check (unsafe - may promote standby with significant lag)
   0 = Require fully caught up (no lag allowed)
  &gt;0 = Allow promotion if lag is less than this value in bytes

Examples:
  10485760    = 10 MB (default)
  104857600   = 100 MB
  1073741824  = 1 GB

Default: 10485760 (10 MB)
</longdesc>
<shortdesc lang="en">Max promotion lag (bytes)</shortdesc>
<content type="integer" default="${OCF_RESKEY_max_promotion_lag_bytes_default}" />
</parameter>

<parameter name="force_promotion" unique="0">
<longdesc lang="en">
Force promotion regardless of replication lag or sync state.

DANGER: Setting this to true bypasses all promotion safety checks!
This can result in data loss if the standby is not caught up with the primary.

Only use this in emergency scenarios where you need to force failover
despite the standby being out of sync.

For normal operations, leave this set to false and let the safety checks
prevent data loss.

Default: false
</longdesc>
<shortdesc lang="en">Force unsafe promotion</shortdesc>
<content type="boolean" default="${OCF_RESKEY_force_promotion_default}" />
</parameter>

</parameters>

<actions>
<action name="start"        timeout="120s" />
<action name="stop"         timeout="120s" />
<action name="status"       timeout="60s" />
<action name="monitor"      timeout="60s" interval="10s" depth="0" />
<action name="monitor"      timeout="60s" interval="8s" depth="0" role="Unpromoted" />
<action name="monitor"      timeout="60s" interval="3s" depth="0" role="Promoted" />
<action name="promote"      timeout="120s" />
<action name="demote"       timeout="120s" />
<action name="notify"       timeout="90s" />
<action name="meta-data"    timeout="5s" />
<action name="validate-all" timeout="5s" />
</actions>
</resource-agent>
END
}

#######################################################################
# Functions

# Validate application name - only alphanumeric and underscore allowed
# Arguments: $1 - candidate application_name (may be empty)
# Returns:   0 if acceptable, 1 if it contains forbidden characters
validate_application_name() {
    local candidate="$1"

    # An empty name is acceptable; the hostname is substituted later.
    if [ -z "$candidate" ]; then
        return 0
    fi

    # Reject any character outside [A-Za-z0-9_] — hyphens in particular.
    if printf '%s\n' "$candidate" | grep -qE '[^a-zA-Z0-9_]'; then
        ocf_log err "Invalid application_name: '$candidate'. Only alphanumeric characters and underscores are allowed. Hyphens (-) are NOT permitted."
        return 1
    fi

    return 0
}

# Get application name for replication
# Sets global variable APPLICATION_NAME
# Uses OCF_RESKEY_application_name when set, otherwise the short hostname
# with hyphens mapped to underscores (hyphens are rejected elsewhere by
# validate_application_name).
get_application_name() {
    if [ -n "${OCF_RESKEY_application_name}" ]; then
        APPLICATION_NAME="${OCF_RESKEY_application_name}"
    else
        # BUG FIX: the previous implementation wrote predictable
        # /tmp/.pgtwin_*.$$ scratch files, which is a symlink-attack risk
        # for an agent that runs as root. Plain command substitution plus
        # bash parameter expansion needs no temp files at all.
        APPLICATION_NAME=$(hostname -s)
        APPLICATION_NAME=${APPLICATION_NAME//-/_}
    fi
}

# Get safe synchronous_standby_names based on cluster topology and actual connections
# Returns comma-separated list of nodes that are both Unpromoted in cluster AND connected to PostgreSQL
# Falls back to '*' if no standbys meet both criteria (prevents write blocking)
# Outputs: the value for synchronous_standby_names on stdout:
#          ''  = async fallback (no eligible standbys / topology unknown)
#          '*' = standbys streaming but application_name != node name
#          otherwise "name1, name2, ..." of verified standbys
# Returns: always 0; callers consume stdout only.
# NOTE(review): depends on crm_mon, xmllint and psql being available; every
# failure path degrades to a permissive value rather than blocking writes.
get_safe_synchronous_standby_names() {
    # BUG FIX v1.6.18: Use dual-source validation (cluster state + PostgreSQL connections)
    # Prevents race conditions where cluster reports Unpromoted but PostgreSQL not connected yet

    # STEP 1: Get Unpromoted nodes from cluster (expected standbys)
    # Find the clone resource that contains this primitive resource
    # (OCF_RESOURCE_INSTANCE may be "name:N" for clones; strip the index)
    local primitive_resource="${OCF_RESOURCE_INSTANCE%%:*}"
    local clone_resource=$(crm_mon --as-xml 2>/dev/null | \
        xmllint --xpath "string(//clone[resource/@id='${primitive_resource}']/@id)" - 2>/dev/null)

    if [ -z "$clone_resource" ]; then
        ocf_log warn "Could not determine clone resource name from primitive '${primitive_resource}'"
        ocf_log warn "Using '' (async) as fallback to prevent write blocking"
        echo ""
        return 0
    fi

    ocf_log debug "Primitive resource: ${primitive_resource}, Clone resource: ${clone_resource}"

    # Extract node names hosting Unpromoted replicas of this clone.
    local expected_standbys=$(crm_mon --as-xml 2>/dev/null | \
        xmllint --xpath "//clone[@id='${clone_resource}']/resource[@role='Unpromoted']/node/@name" - 2>/dev/null | \
        sed 's/name="//g; s/"//g')

    if [ -z "$expected_standbys" ]; then
        ocf_log info "No Unpromoted nodes in cluster, using '' (async) to prevent write blocking"
        echo ""
        return 0
    fi

    # STEP 2: Get actually connected standbys from PostgreSQL
    # Query pg_stat_replication for connected application_names with state='streaming'
    local connected_standbys=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \
        \"SELECT application_name FROM pg_stat_replication WHERE state = 'streaming'\"" 2>/dev/null)

    if [ -z "$connected_standbys" ]; then
        ocf_log warn "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓"
        ocf_log warn "┃ WARNING: No standbys currently connected to PostgreSQL      ┃"
        ocf_log warn "┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛"
        ocf_log warn "Expected standbys from cluster: $expected_standbys"
        ocf_log warn "Using synchronous_standby_names='' (async) to prevent write blocking"
        ocf_log warn "This will be updated when standbys establish streaming replication"
        echo ""
        return 0
    fi

    # STEP 3: Find intersection (nodes that are both expected AND connected)
    local safe_standbys=""
    local pending_standbys=""

    for expected in $expected_standbys; do
        # Strip domain from expected (FQDN -> hostname) for comparison
        # Example: "psql1.example.com" -> "psql1"
        local expected_short="${expected%%.*}"

        # Check if this expected standby is actually connected and streaming
        # Compare using short hostname since PostgreSQL application_name uses hostname not FQDN
        # NOTE(review): the pattern is an unescaped regex anchored with ^/$;
        # fine for typical hostnames, would misfire on regex metacharacters.
        if echo "$connected_standbys" | grep -q "^${expected_short}$"; then
            if [ -z "$safe_standbys" ]; then
                safe_standbys="$expected_short"
            else
                safe_standbys="${safe_standbys}, ${expected_short}"
            fi
        else
            if [ -z "$pending_standbys" ]; then
                pending_standbys="$expected_short"
            else
                pending_standbys="$pending_standbys, $expected_short"
            fi
        fi
    done

    if [ -z "$safe_standbys" ]; then
        ocf_log warn "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓"
        ocf_log warn "┃ WARNING: application_name mismatch between cluster and PG  ┃"
        ocf_log warn "┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛"
        ocf_log warn "Expected node names from cluster: $expected_standbys"
        ocf_log warn "application_names connected to PostgreSQL: $connected_standbys"
        ocf_log warn "Standbys ARE streaming but application_name does not match node name"
        ocf_log warn "Using synchronous_standby_names='*' to keep sync replication active"
        ocf_log warn "Fix: ensure primary_conninfo sets application_name=<nodename> on the standby"
        echo "*"
        return 0
    fi

    # Log status if some standbys are pending
    if [ -n "$pending_standbys" ]; then
        ocf_log info "Safe synchronous standbys (connected + streaming): $safe_standbys"
        ocf_log info "Pending standbys (not yet connected): $pending_standbys"
    else
        ocf_log info "Safe synchronous standbys (cluster + connected): $safe_standbys"
    fi

    echo "$safe_standbys"
}

# Update application_name in postgresql.auto.conf
# This ensures the correct application_name is set when starting or promoting
# Globals: reads OCF_RESKEY_rep_mode/OCF_RESKEY_pgport; sets APPLICATION_NAME
#          (via get_application_name).
# Returns: 0 on success or when there is nothing to do (name empty, server
#          not running); 1 if an ALTER SYSTEM statement fails.
update_application_name_in_config() {
    # v1.7.0: Refactored to use ALTER SYSTEM instead of manual file editing
    # This function now handles both application_name and synchronous_standby_names
    # v1.6.18: synchronous_standby_names now uses cluster topology + connection state
    local app_name

    get_application_name
    app_name="$APPLICATION_NAME"

    if [ -z "$app_name" ]; then
        ocf_log warn "No application_name to set in configuration"
        return 0
    fi

    # Check if PostgreSQL is running (required for ALTER SYSTEM)
    if ! pgsql_is_running; then
        ocf_log debug "PostgreSQL not running, cannot use ALTER SYSTEM. Will configure on next start."
        return 0
    fi

    # Set application_name using ALTER SYSTEM
    ocf_log info "Configuring application_name='${app_name}' using ALTER SYSTEM"
    run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \"ALTER SYSTEM SET application_name = '${app_name}'\"" >/dev/null 2>&1

    if [ $? -ne 0 ]; then
        ocf_log warn "Failed to set application_name using ALTER SYSTEM"
        return 1
    fi

    # BUG FIX v1.6.18: Configure synchronous_standby_names based on cluster topology + connection state
    # If this is a promoted instance with rep_mode=sync, configure synchronous_standby_names
    if pgsql_is_promoted && [ "${OCF_RESKEY_rep_mode}" = "sync" ]; then
        local current_sync=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW synchronous_standby_names\"" 2>/dev/null)

        # Check if admin has set advanced syntax (FIRST, ANY) - preserve it
        if echo "$current_sync" | grep -qE '^FIRST |^ANY '; then
            ocf_log info "synchronous_standby_names uses advanced syntax: '$current_sync'"
            ocf_log info "Preserving admin configuration (agent will not auto-update)"
            # Skip automatic management, admin controls this
        else
            # Get safe standby names based on cluster topology AND actual connections
            local safe_sync=$(get_safe_synchronous_standby_names)

            # Only issue ALTER SYSTEM when the value actually changes.
            if [ "$current_sync" != "$safe_sync" ]; then
                ocf_log info "Updating synchronous_standby_names based on cluster topology and connection state"
                ocf_log info "Current: '$current_sync'"
                ocf_log info "New: '$safe_sync'"

                run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \
                    \"ALTER SYSTEM SET synchronous_standby_names = '${safe_sync}'\"" >/dev/null 2>&1

                if [ $? -eq 0 ]; then
                    ocf_log info "Synchronous replication configured: synchronous_standby_names='${safe_sync}'"
                else
                    ocf_log err "Failed to update synchronous_standby_names"
                    return 1
                fi
            else
                ocf_log debug "synchronous_standby_names already correct: '$safe_sync'"
            fi
        fi
    fi

    # Reload configuration to apply changes
    # (ALTER SYSTEM only writes postgresql.auto.conf; pg_reload_conf applies it)
    run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \"SELECT pg_reload_conf()\"" >/dev/null 2>&1

    if [ $? -eq 0 ]; then
        ocf_log debug "PostgreSQL configuration reloaded successfully"
    else
        ocf_log warn "Failed to reload PostgreSQL configuration"
    fi

    return 0
}

# Get the Pacemaker cluster node name for this node
# Returns the cluster node name (not hostname) which is required for
# crm_attribute and node list operations; falls back to the short hostname
# when crm_node is unavailable or fails.
get_cluster_node_name() {
    local node
    if node=$(crm_node -n 2>/dev/null); then
        printf '%s\n' "$node"
    else
        hostname -s
    fi
}

#######################################################################
# Container Mode Functions
#
# These functions provide podman container support for PostgreSQL.
# When container_mode=true, all PostgreSQL operations are executed
# inside containers instead of directly on the host.
#######################################################################

# Check if container exists
# Returns podman's status: 0 when a container with the configured name
# exists (in any state), non-zero otherwise.
container_exists() {
    podman container exists "${OCF_RESKEY_container_name}" 2>/dev/null
}

# Check if container is running
# Returns 0 only when podman reports State.Running == "true".
container_is_running() {
    local running
    running=$(podman inspect --format '{{.State.Running}}' "${OCF_RESKEY_container_name}" 2>/dev/null)
    [ "$running" = "true" ]
}

# Create PostgreSQL container (does not start it)
# Pulls the image on demand, then creates a stopped container with host
# networking and the data directory bind-mounted from the host.
# Returns: 0 on success (or if it already exists), OCF_ERR_GENERIC on failure.
container_create() {
    if container_exists; then
        ocf_log debug "Container ${OCF_RESKEY_container_name} already exists"
        return 0
    fi

    ocf_log info "Creating PostgreSQL container: ${OCF_RESKEY_container_name}"

    # Fetch the image first if it is not already in local storage.
    if ! podman image exists "${OCF_RESKEY_container_image}"; then
        ocf_log info "Pulling container image: ${OCF_RESKEY_container_image}"
        if ! podman pull "${OCF_RESKEY_container_image}"; then
            ocf_log err "Failed to pull container image: ${OCF_RESKEY_container_image}"
            return $OCF_ERR_GENERIC
        fi
    fi

    # Key design: --network host for HA cluster networking; data directory
    # bind-mounted from the host. The container idles on 'tail -f' so the
    # PostgreSQL lifecycle stays under pg_ctl control.
    if podman create \
        --name "${OCF_RESKEY_container_name}" \
        --network host \
        --privileged \
        -v "${PGDATA}:/var/lib/pgsql/data:Z" \
        -e PGDATA="/var/lib/pgsql/data" \
        "${OCF_RESKEY_container_image}" \
        tail -f /dev/null; then
        ocf_log info "Container ${OCF_RESKEY_container_name} created successfully"
        return 0
    fi

    ocf_log err "Failed to create container ${OCF_RESKEY_container_name}"
    return $OCF_ERR_GENERIC
}

# Start container (must already exist)
# No-op when already running. Returns 0 on success, OCF_ERR_GENERIC on failure.
container_start() {
    if container_is_running; then
        ocf_log debug "Container ${OCF_RESKEY_container_name} is already running"
        return 0
    fi

    ocf_log info "Starting container: ${OCF_RESKEY_container_name}"
    if ! podman start "${OCF_RESKEY_container_name}"; then
        ocf_log err "Failed to start container ${OCF_RESKEY_container_name}"
        return $OCF_ERR_GENERIC
    fi

    return 0
}

# Stop container
# Tries a graceful stop with a 60s grace period, then escalates to kill.
# Always returns 0 (best-effort; no-op when not running).
container_stop() {
    if ! container_is_running; then
        ocf_log debug "Container ${OCF_RESKEY_container_name} is not running"
        return 0
    fi

    ocf_log info "Stopping container: ${OCF_RESKEY_container_name}"
    if ! podman stop -t 60 "${OCF_RESKEY_container_name}"; then
        ocf_log warn "Failed to stop container gracefully, forcing..."
        podman kill "${OCF_RESKEY_container_name}"
    fi

    return 0
}

# Remove container
# Force-removes the container if present; best-effort, always returns 0.
container_remove() {
    container_exists || return 0

    ocf_log info "Removing container: ${OCF_RESKEY_container_name}"
    podman rm -f "${OCF_RESKEY_container_name}" >/dev/null 2>&1
    return 0
}

# Execute command in container
# Usage: container_exec [command...]
# Returns OCF_ERR_GENERIC when the container is not running, otherwise the
# exit status of the command run via 'podman exec'.
container_exec() {
    if ! container_is_running; then
        ocf_log err "Cannot execute command: container ${OCF_RESKEY_container_name} is not running"
        return $OCF_ERR_GENERIC
    fi

    podman exec "${OCF_RESKEY_container_name}" "$@"
}

# Execute command as postgres user in container
# Usage: container_exec_as_postgres [command...]
# Thin wrapper: delegates to container_exec via runuser inside the container.
container_exec_as_postgres() {
    container_exec runuser -u postgres -- "$@"
}

#######################################################################
# Wrapper Functions for PostgreSQL Operations
# These detect container mode and route commands appropriately
#######################################################################

# Route pg_ctl to the container or the bare-metal binary, passing all
# arguments through unchanged.
pg_ctl_wrapper() {
    case "${OCF_RESKEY_container_mode}" in
        true) container_exec_as_postgres /usr/bin/pg_ctl "$@" ;;
        *)    run_as_pguser "${PGCTL}" "$@" ;;
    esac
    return $?
}

# Route psql to the container or the bare-metal binary, passing all
# arguments through unchanged.
psql_wrapper() {
    case "${OCF_RESKEY_container_mode}" in
        true) container_exec_as_postgres /usr/bin/psql "$@" ;;
        *)    run_as_pguser "${PSQL}" "$@" ;;
    esac
    return $?
}

# Route pg_rewind to the container or the bare-metal binary, passing all
# arguments through unchanged.
pg_rewind_wrapper() {
    case "${OCF_RESKEY_container_mode}" in
        true) container_exec_as_postgres /usr/bin/pg_rewind "$@" ;;
        *)    run_as_pguser "${PG_REWIND}" "$@" ;;
    esac
    return $?
}

# Route pg_basebackup to the container or the bare-metal binary, passing
# all arguments through unchanged.
pg_basebackup_wrapper() {
    case "${OCF_RESKEY_container_mode}" in
        true) container_exec_as_postgres /usr/bin/pg_basebackup "$@" ;;
        *)    run_as_pguser "${PG_BASEBACKUP}" "$@" ;;
    esac
    return $?
}

# Parse .pgpass file to get replication credentials
# Format: host:port:database:username:password
#
# Scans for "replication" database entries, skipping:
#   - comment (#) and blank lines, which are legal in .pgpass
#     (BUG FIX: a commented-out entry containing ":replication:" was
#     previously picked up by grep and parsed as live credentials)
#   - entries for the local node / localhost (prevents self-replication)
#   - entries not in node_list (prevents cross-cluster replication)
#
# Outputs "user:host" on stdout and returns 0; on failure outputs the
# fallback "replicator:" and returns 1.
parse_pgpass() {
    local pgpass_file="${OCF_RESKEY_pgpassfile}"
    local rep_user=""
    local rep_host=""
    # Split declaration from assignment so command failures are not masked
    local local_node local_hostname
    local_node=$(get_cluster_node_name)
    local_hostname=$(hostname)

    # Default to postgres user's .pgpass if not specified
    if [ -z "$pgpass_file" ]; then
        pgpass_file="/var/lib/${OCF_RESKEY_pguser}/.pgpass"
    fi

    if [ ! -f "$pgpass_file" ]; then
        ocf_log warn "pgpass file not found: $pgpass_file, using default 'replicator' user"
        echo "replicator:"
        return 1
    fi

    # Look for replication database entries, but exclude local node entries
    # Format: host:port:replication:username:password
    # v1.7.0: Filter out entries matching local node/hostname to prevent self-replication
    # v1.7.0: Only consider entries from node_list to prevent cross-cluster replication
    local entry=""
    while IFS= read -r line; do
        # Skip comment and blank lines (grep only guarantees the line
        # contains ":replication:", not that it is an active entry)
        case "$line" in
            \#*|"") continue ;;
        esac

        local entry_host="${line%%:*}"

        # Skip if entry matches local cluster node name or local hostname
        if [ "$entry_host" = "$local_node" ] || [ "$entry_host" = "$local_hostname" ]; then
            ocf_log debug "Skipping .pgpass entry for local node: $entry_host"
            continue
        fi

        # Skip if entry matches localhost or 127.0.0.1
        if [ "$entry_host" = "localhost" ] || [ "$entry_host" = "127.0.0.1" ]; then
            ocf_log debug "Skipping .pgpass entry for localhost: $entry_host"
            continue
        fi

        # BUG FIX v1.7.0: Only accept entries that are in node_list
        # This prevents cross-cluster replication when multiple PostgreSQL clusters run on same nodes
        if [ -n "${OCF_RESKEY_node_list}" ]; then
            local in_node_list=false
            # Strip domain for comparison (handles FQDN vs hostname mismatch)
            local entry_host_short="${entry_host%%.*}"
            for node in ${OCF_RESKEY_node_list}; do
                local node_short="${node%%.*}"
                if [ "$entry_host_short" = "$node_short" ]; then
                    in_node_list=true
                    break
                fi
            done

            if [ "$in_node_list" = "false" ]; then
                ocf_log debug "Skipping .pgpass entry not in node_list: $entry_host (node_list=${OCF_RESKEY_node_list})"
                continue
            fi
        fi

        # Found a valid remote host entry
        entry="$line"
        break
    done < <(grep ":replication:" "$pgpass_file")

    if [ -z "$entry" ]; then
        ocf_log warn "No remote replication entry found in $pgpass_file (filtered local node: $local_node/$local_hostname)"
        echo "replicator:"
        return 1
    fi

    # Parse the entry: host:port:database:username:password
    rep_host=$(echo "$entry" | cut -d: -f1)
    rep_user=$(echo "$entry" | cut -d: -f4)

    if [ -z "$rep_user" ]; then
        rep_user="replicator"
    fi

    ocf_log info "Parsed replication credentials from $pgpass_file: user=$rep_user, host=$rep_host"
    echo "${rep_user}:${rep_host}"
    return 0
}

# Replication user = first field of parse_pgpass output ("user:host").
get_replication_user() {
    parse_pgpass | cut -d: -f1
}

# Replication host = second field of parse_pgpass output; when that is
# empty or the wildcard "*", fall back to the first node_list entry that
# is not the local node.
get_replication_host() {
    local rep_host
    rep_host=$(parse_pgpass | cut -d: -f2)

    if [ -z "$rep_host" ] || [ "$rep_host" = "*" ]; then
        local node
        for node in ${OCF_RESKEY_node_list}; do
            [ "$node" = "$(get_cluster_node_name)" ] && continue
            echo "$node"
            return 0
        done
    fi

    echo "$rep_host"
}

# Check disk space before pg_basebackup
# Uses actual disk usage (including logs, temp files, etc.) instead of pg_database_size
#
# Sizing strategy, in order of preference:
#   1. du of the local PGDATA (most accurate — counts WAL, logs, temp files)
#   2. pg_database_size('postgres') queried from the primary, + 20% overhead
#   3. a hard-coded 1 GB guess as last resort
# The requirement is doubled when backup_before_basebackup=true (old copy
# plus new basebackup must coexist), then a 10% WAL-growth margin is added.
#
# Arguments: $1 - primary host to query when a local estimate is unavailable
# Returns:   0 when enough space is available, 1 otherwise
# NOTE(review): `du -sb` and `df -BM` are GNU coreutils options — assumes
# Linux targets; confirm if BSD/busybox support is ever needed.
check_disk_space_for_basebackup() {
    local primary_host="$1"
    local required_space_mb=0
    local available_space_mb=0
    local actual_disk_usage_mb=0
    local backup_multiplier=1

    ocf_log info "Checking disk space before pg_basebackup (backup_mode=${OCF_RESKEY_backup_before_basebackup})"

    # Get actual disk usage of PGDATA (includes everything: data, logs, WAL, temp files, etc.)
    # This is more accurate than pg_database_size() which only counts logical database size
    if [ -d "${PGDATA}" ]; then
        local disk_usage_bytes=$(du -sb "${PGDATA}" 2>/dev/null | awk '{print $1}')

        if [ -n "$disk_usage_bytes" ] && [ "$disk_usage_bytes" -gt 0 ]; then
            actual_disk_usage_mb=$((disk_usage_bytes / 1024 / 1024))
            ocf_log info "Actual PGDATA disk usage: ${actual_disk_usage_mb}MB (includes data, WAL, logs, etc.)"
        else
            # Fallback: try to get from primary
            ocf_log warn "Could not determine local disk usage, querying primary database size"
            local db_size_bytes=$(run_as_pguser sh -c "${PSQL} -h ${primary_host} -p ${OCF_RESKEY_pgport} -U $(get_replication_user) -d postgres -Atc \"SELECT pg_database_size('postgres')\"" 2>/dev/null)

            if [ -n "$db_size_bytes" ] && [ "$db_size_bytes" -gt 0 ]; then
                actual_disk_usage_mb=$((db_size_bytes / 1024 / 1024))
                # Add 20% overhead for WAL, logs, temp files
                actual_disk_usage_mb=$((actual_disk_usage_mb * 120 / 100))
                ocf_log info "Estimated disk usage from primary DB size + 20% overhead: ${actual_disk_usage_mb}MB"
            else
                # Last resort: default estimate
                actual_disk_usage_mb=1024
                ocf_log warn "Using default estimate of 1GB for disk usage"
            fi
        fi
    else
        # PGDATA doesn't exist yet - use conservative estimate from primary
        ocf_log info "PGDATA does not exist, querying primary for size estimate"
        local db_size_bytes=$(run_as_pguser sh -c "${PSQL} -h ${primary_host} -p ${OCF_RESKEY_pgport} -U $(get_replication_user) -d postgres -Atc \"SELECT pg_database_size('postgres')\"" 2>/dev/null)

        if [ -n "$db_size_bytes" ] && [ "$db_size_bytes" -gt 0 ]; then
            actual_disk_usage_mb=$((db_size_bytes / 1024 / 1024))
            actual_disk_usage_mb=$((actual_disk_usage_mb * 120 / 100))
            ocf_log info "Estimated disk usage: ${actual_disk_usage_mb}MB"
        else
            actual_disk_usage_mb=1024
            ocf_log warn "Using default estimate of 1GB"
        fi
    fi

    # Calculate required space based on backup mode
    if [ "${OCF_RESKEY_backup_before_basebackup}" = "true" ]; then
        # Backup mode: need space for old backup + new basebackup
        backup_multiplier=2
        required_space_mb=$((actual_disk_usage_mb * backup_multiplier))
        ocf_log info "Backup mode enabled: require ${backup_multiplier}× disk usage = ${required_space_mb}MB"
    else
        # No backup mode: only need space for new basebackup
        backup_multiplier=1
        required_space_mb=$((actual_disk_usage_mb * backup_multiplier))
        ocf_log info "No-backup mode: require ${backup_multiplier}× disk usage = ${required_space_mb}MB"
    fi

    # Add safety margin of 10% for WAL growth during basebackup
    local safety_margin_mb=$((required_space_mb / 10))
    required_space_mb=$((required_space_mb + safety_margin_mb))

    # Get available space on data directory filesystem
    # (parent dir is used so the check works even before PGDATA exists)
    local pgdata_parent=$(dirname "${PGDATA}")
    available_space_mb=$(df -BM "${pgdata_parent}" 2>/dev/null | tail -1 | awk '{print $4}' | sed 's/M//')

    if [ -z "$available_space_mb" ]; then
        ocf_log err "Could not determine available disk space"
        return 1
    fi

    ocf_log info "Disk space check: actual_usage=${actual_disk_usage_mb}MB, multiplier=${backup_multiplier}x, safety=+${safety_margin_mb}MB, required=${required_space_mb}MB, available=${available_space_mb}MB"

    if [ "$available_space_mb" -lt "$required_space_mb" ]; then
        ocf_log err "Insufficient disk space for pg_basebackup! Required: ${required_space_mb}MB, Available: ${available_space_mb}MB (backup_mode=${OCF_RESKEY_backup_before_basebackup})"
        return 1
    fi

    ocf_log info "Disk space check PASSED: ${available_space_mb}MB available, ${required_space_mb}MB required"
    return 0
}

# Ensure .pgpass file exists in standard location (/var/lib/pgsql/.pgpass).
#
# Three cases:
#   - no pgpassfile parameter: accept the standard file if present
#   - parameter already points at the standard location: just verify it
#   - parameter points elsewhere: copy it into place with correct
#     ownership (pguser:pguser-group) and mode 600
#
# Sets global variable PGPASS_FILE on success; returns 0/1.
#
# BUG FIXES vs previous version:
#   - `cp` failure is now detected (previously only the trailing chmod
#     exit status was checked, so a failed copy was reported as success)
#   - temp file for the group lookup uses mktemp instead of the
#     predictable /tmp/.pgtwin_group.$$ name (symlink-attack hardening)
ensure_pgpass() {
    local standard_pgpass="/var/lib/pgsql/.pgpass"
    local pguser_group
    local group_tmp

    # If no pgpassfile parameter set, check if standard location exists
    if [ -z "${OCF_RESKEY_pgpassfile}" ]; then
        if [ -f "$standard_pgpass" ]; then
            ocf_log debug ".pgpass exists in standard location: $standard_pgpass"
            PGPASS_FILE="$standard_pgpass"
            return 0
        else
            ocf_log warn "No pgpassfile configured and $standard_pgpass does not exist"
            return 1
        fi
    fi

    # If pgpassfile parameter points to standard location, we're good
    if [ "${OCF_RESKEY_pgpassfile}" = "$standard_pgpass" ]; then
        if [ -f "$standard_pgpass" ]; then
            ocf_log debug ".pgpass exists in standard location: $standard_pgpass"
            PGPASS_FILE="$standard_pgpass"
            return 0
        else
            ocf_log err "Configured pgpassfile $standard_pgpass does not exist"
            return 1
        fi
    fi

    # pgpassfile points to non-standard location - copy to standard location
    if [ -f "${OCF_RESKEY_pgpassfile}" ]; then
        ocf_log info "Copying .pgpass from ${OCF_RESKEY_pgpassfile} to $standard_pgpass"
        if ! cp "${OCF_RESKEY_pgpassfile}" "$standard_pgpass"; then
            ocf_log err "Failed to copy .pgpass to standard location"
            return 1
        fi

        # Get group name without command substitution, via an unpredictable
        # temp file (mktemp) rather than a guessable /tmp name
        group_tmp=$(mktemp) || {
            ocf_log err "Failed to create temporary file for group lookup"
            return 1
        }
        id -gn "${OCF_RESKEY_pguser}" > "$group_tmp"
        read pguser_group < "$group_tmp"
        rm -f "$group_tmp"

        if chown "${OCF_RESKEY_pguser}:${pguser_group}" "$standard_pgpass" &&
           chmod 600 "$standard_pgpass"; then
            ocf_log info ".pgpass successfully copied to standard location"
            PGPASS_FILE="$standard_pgpass"
            return 0
        else
            ocf_log err "Failed to set ownership/permissions on $standard_pgpass"
            return 1
        fi
    else
        ocf_log err "Configured pgpassfile ${OCF_RESKEY_pgpassfile} does not exist"
        return 1
    fi
}

# Validate pg_hba.conf for replication and pg_rewind access.
#
# Queries the live pg_hba_file_rules view (so it reflects the file on
# disk, including not-yet-reloaded edits) for two kinds of entries:
#   - 'postgres' database access for the replication user (needed by
#     pg_rewind; missing => warning only, recovery falls back to
#     pg_basebackup)
#   - 'replication' database access (needed by streaming replication
#     and pg_basebackup; missing => hard error)
# Only runs on the promoted node; standbys return 0 immediately.
# A /24 network suggestion is derived from the first node_list entry
# when it is an IPv4 address, purely for log guidance.
# Returns: 1 when replication access is missing, 0 otherwise.
validate_pghba_config() {
    ocf_log info "Validating pg_hba.conf for replication access..."

    # Only check on promoted (primary) node
    if ! pgsql_is_promoted; then
        ocf_log debug "Not promoted, skipping pg_hba.conf validation"
        return 0
    fi

    local rep_user=$(get_replication_user)
    local has_errors=0
    local has_warnings=0

    # Determine expected network from node_list or make a suggestion
    local suggested_network=""
    if [ -n "${OCF_RESKEY_node_list}" ]; then
        # Try to extract network from first IP in node_list
        local first_node=$(echo "${OCF_RESKEY_node_list}" | awk '{print $1}')
        if [[ $first_node =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
            # It's an IP address - derive /24 network
            suggested_network=$(echo "$first_node" | sed 's/\.[0-9]*$/\.0\/24/')
        else
            suggested_network="<your_cluster_network>/24"
        fi
    else
        suggested_network="<your_cluster_network>/24"
    fi

    # Check for postgres database access (required for pg_rewind)
    # (@> is array containment: matches rules listing the db/user explicitly)
    local postgres_access=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"
        SELECT COUNT(*) FROM pg_hba_file_rules
        WHERE database @> '{postgres}'
        AND user_name @> '{${rep_user}}'
        AND type IN ('host', 'hostssl')
    \"" 2>/dev/null)

    if [ "$postgres_access" = "0" ] || [ -z "$postgres_access" ]; then
        ocf_log warn "pg_hba.conf WARNING: Missing entry for pg_rewind access"
        ocf_log warn "pg_rewind requires SQL access to 'postgres' database"
        ocf_log warn "Without this, recovery will always use slower pg_basebackup"
        ocf_log warn ""
        ocf_log warn "SUGGESTED FIX: Add to pg_hba.conf on PRIMARY (before restrictive rules):"
        ocf_log warn "  host    postgres        ${rep_user}      ${suggested_network}        scram-sha-256"
        ocf_log warn ""
        ocf_log warn "After adding, reload: sudo -u postgres psql -c \"SELECT pg_reload_conf();\""
        has_warnings=1
    else
        ocf_log info "✓ pg_hba.conf allows pg_rewind access (postgres database)"
    fi

    # Check for replication database access (required for streaming replication and pg_basebackup)
    local replication_access=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"
        SELECT COUNT(*) FROM pg_hba_file_rules
        WHERE database @> '{replication}'
        AND user_name @> '{${rep_user}}'
        AND type IN ('host', 'hostssl')
    \"" 2>/dev/null)

    if [ "$replication_access" = "0" ] || [ -z "$replication_access" ]; then
        ocf_log err "pg_hba.conf ERROR: Missing entry for replication access"
        ocf_log err "Streaming replication and pg_basebackup REQUIRE 'replication' database access"
        ocf_log err ""
        ocf_log err "REQUIRED FIX: Add to pg_hba.conf on PRIMARY (before restrictive rules):"
        ocf_log err "  host    replication     ${rep_user}      ${suggested_network}        scram-sha-256"
        ocf_log err ""
        ocf_log err "After adding, reload: sudo -u postgres psql -c \"SELECT pg_reload_conf();\""
        has_errors=1
    else
        ocf_log info "✓ pg_hba.conf allows replication access"
    fi

    # Additional helpful suggestions
    if [ $has_errors -ne 0 ] || [ $has_warnings -ne 0 ]; then
        ocf_log info ""
        ocf_log info "Complete pg_hba.conf example (add near top of file):"
        ocf_log info "  # Replication access for HA cluster"
        ocf_log info "  host    postgres        ${rep_user}      ${suggested_network}        scram-sha-256"
        ocf_log info "  host    replication     ${rep_user}      ${suggested_network}        scram-sha-256"
    fi

    if [ $has_errors -ne 0 ]; then
        return 1
    fi

    return 0
}

# Check PostgreSQL configuration for replication requirements.
#
# Audits the *running* instance (via SHOW / pg_settings, so the server
# must be up — returns 0 immediately when it is not) for twelve settings
# relevant to HA replication, logging an actionable FIX line for each
# finding. Hard errors (wal_level, max_wal_senders, max_replication_slots,
# restart_after_crash, and missing pg_hba replication entries on the
# primary) fail the check; everything else is a logged warning.
#
# Globals read: OCF_RESKEY_pgport, OCF_RESKEY_rep_mode, PGDATA, PSQL,
#               APPLICATION_NAME (populated by get_application_name)
# Returns: 1 on hard errors, 0 otherwise (including warnings-only).
check_postgresql_config() {
    local config_ok=0   # flips to 1 on any hard error ("ok" despite the name)
    local warnings=0    # count of non-fatal findings
    local expected_app_name

    get_application_name
    expected_app_name="$APPLICATION_NAME"

    ocf_log info "Validating PostgreSQL configuration for replication..."

    # Check if PostgreSQL is running first
    if ! pgsql_is_running; then
        ocf_log warn "PostgreSQL not running, skipping configuration check"
        return 0
    fi

    # 1. Check wal_level
    local wal_level=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW wal_level\"" 2>/dev/null)
    if [ "$wal_level" != "replica" ] && [ "$wal_level" != "logical" ]; then
        ocf_log err "CONFIGURATION ERROR: wal_level='${wal_level}' - MUST be 'replica' or 'logical' for replication!"
        ocf_log err "FIX: Set 'wal_level = replica' in postgresql.conf"
        config_ok=1
    else
        ocf_log info "✓ wal_level='${wal_level}' (OK)"
    fi

    # 2. Check max_wal_senders
    local max_wal_senders=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW max_wal_senders\"" 2>/dev/null)
    if [ -n "$max_wal_senders" ] && [ "$max_wal_senders" -lt 2 ]; then
        ocf_log err "CONFIGURATION ERROR: max_wal_senders='${max_wal_senders}' - MUST be at least 2 for replication!"
        ocf_log err "FIX: Set 'max_wal_senders = 10' (or higher) in postgresql.conf"
        config_ok=1
    else
        ocf_log info "✓ max_wal_senders='${max_wal_senders}' (OK)"
    fi

    # 3. Check max_replication_slots
    local max_repl_slots=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW max_replication_slots\"" 2>/dev/null)
    if [ -n "$max_repl_slots" ] && [ "$max_repl_slots" -lt 2 ]; then
        ocf_log err "CONFIGURATION ERROR: max_replication_slots='${max_repl_slots}' - MUST be at least 2!"
        ocf_log err "FIX: Set 'max_replication_slots = 10' (or higher) in postgresql.conf"
        config_ok=1
    else
        ocf_log info "✓ max_replication_slots='${max_repl_slots}' (OK)"
    fi

    # 4. Check hot_standby
    local hot_standby=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW hot_standby\"" 2>/dev/null)
    if [ "$hot_standby" != "on" ]; then
        ocf_log warn "CONFIGURATION WARNING: hot_standby='${hot_standby}' - RECOMMENDED: 'on' for standby read queries"
        ocf_log warn "FIX: Set 'hot_standby = on' in postgresql.conf"
        warnings=$((warnings + 1))
    else
        ocf_log info "✓ hot_standby='${hot_standby}' (OK)"
    fi

    # 5. Check synchronous_commit (for sync replication mode)
    if [ "${OCF_RESKEY_rep_mode}" = "sync" ]; then
        local sync_commit=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW synchronous_commit\"" 2>/dev/null)
        if [ "$sync_commit" = "off" ] || [ "$sync_commit" = "local" ]; then
            ocf_log warn "CONFIGURATION WARNING: synchronous_commit='${sync_commit}' with rep_mode='sync'"
            ocf_log warn "RECOMMENDED: synchronous_commit='on' or 'remote_write' or 'remote_apply' for synchronous replication"
            ocf_log warn "FIX: Set 'synchronous_commit = on' in postgresql.conf"
            warnings=$((warnings + 1))
        else
            ocf_log info "✓ synchronous_commit='${sync_commit}' (OK for sync mode)"
        fi
    fi

    # 6. Check synchronous_standby_names
    local sync_standby_names=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW synchronous_standby_names\"" 2>/dev/null)

    if [ "${OCF_RESKEY_rep_mode}" = "sync" ]; then
        # Synchronous mode - should have standby configured
        if [ -z "$sync_standby_names" ] || [ "$sync_standby_names" = "" ]; then
            ocf_log warn "CONFIGURATION WARNING: synchronous_standby_names is EMPTY with rep_mode='sync'"
            ocf_log warn "RECOMMENDED: Set synchronous_standby_names to match application_name or use '*'"
            ocf_log warn "SUGGESTION: synchronous_standby_names = '*' (matches any standby)"
            ocf_log warn "OR: synchronous_standby_names = '${expected_app_name}' (specific standby)"
            ocf_log warn "FIX: Add to postgresql.conf or postgresql.auto.conf"
            warnings=$((warnings + 1))
        else
            ocf_log info "✓ synchronous_standby_names='${sync_standby_names}' (configured)"

            # Check if it matches expected application_name
            if [ "$sync_standby_names" != "*" ] && [ "$sync_standby_names" != "${expected_app_name}" ]; then
                # Check if it's in list format (e.g., "FIRST 1 (name1, name2)")
                if ! echo "$sync_standby_names" | grep -q "${expected_app_name}"; then
                    ocf_log warn "CONFIGURATION NOTE: synchronous_standby_names='${sync_standby_names}'"
                    ocf_log warn "Expected application_name='${expected_app_name}' not found in list"
                    ocf_log warn "Standby may not be synchronous if application_name doesn't match"
                    ocf_log warn "SUGGESTION: Use '*' to match any standby, or add '${expected_app_name}' to the list"
                    warnings=$((warnings + 1))
                fi
            fi
        fi
    else
        # Asynchronous mode - should be empty
        if [ -n "$sync_standby_names" ] && [ "$sync_standby_names" != "" ]; then
            ocf_log warn "CONFIGURATION NOTE: synchronous_standby_names='${sync_standby_names}' with rep_mode='async'"
            ocf_log warn "This will enforce synchronous replication despite rep_mode='async'"
            ocf_log warn "SUGGESTION: Clear synchronous_standby_names for pure async mode"
            warnings=$((warnings + 1))
        else
            ocf_log info "✓ synchronous_standby_names empty (OK for async mode)"
        fi
    fi

    # 7. Check restart_after_crash (CRITICAL for HA clusters)
    local restart_after_crash=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW restart_after_crash\"" 2>/dev/null)
    if [ "$restart_after_crash" != "off" ]; then
        ocf_log err "CRITICAL ERROR: restart_after_crash='${restart_after_crash}' - MUST be 'off' for Pacemaker-managed clusters!"
        ocf_log err "FIX: Set 'restart_after_crash = off' in postgresql.conf IMMEDIATELY"
        ocf_log err "REASON: PostgreSQL must not auto-restart after crash - Pacemaker manages lifecycle"
        ocf_log err "DANGER: If 'on', PostgreSQL will compete with Pacemaker, causing split-brain scenarios!"
        config_ok=1
    else
        ocf_log info "✓ restart_after_crash='${restart_after_crash}' (OK - Pacemaker manages restarts)"
    fi

    # 8. Check wal_sender_timeout (performance/reliability)
    # Query pg_settings to get value in base unit (milliseconds)
    local timeout_ms=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SELECT setting::int FROM pg_settings WHERE name = 'wal_sender_timeout'\"" 2>/dev/null)
    if [ -n "$timeout_ms" ]; then
        if [ "$timeout_ms" -lt 10000 ]; then
            ocf_log warn "PERFORMANCE WARNING: wal_sender_timeout=${timeout_ms}ms (<10 seconds) is very aggressive"
            ocf_log warn "May cause false disconnections on network hiccups, GC pauses, or CPU spikes"
            ocf_log warn "RECOMMENDATION: Set to 15000-30000ms for production stability"
            ocf_log warn "FIX: Set 'wal_sender_timeout = 30000' in postgresql.conf"
            warnings=$((warnings + 1))
        else
            ocf_log info "✓ wal_sender_timeout=${timeout_ms}ms (OK)"
        fi
    fi

    # 9. Check max_standby_streaming_delay (replication lag control)
    local max_standby_delay=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW max_standby_streaming_delay\"" 2>/dev/null)
    if [ "$max_standby_delay" = "-1" ]; then
        ocf_log warn "REPLICATION WARNING: max_standby_streaming_delay=-1 allows unbounded replication lag"
        ocf_log warn "Long-running queries on standby will delay WAL replay indefinitely"
        ocf_log warn "Can cause replication lag to grow without limit"
        ocf_log warn "RECOMMENDATION: Set to 30000-60000 (30-60 seconds) for production"
        ocf_log warn "FIX: Set 'max_standby_streaming_delay = 60000' in postgresql.conf"
        ocf_log warn "TRADE-OFF: Long queries may be cancelled, but replication lag stays controlled"
        warnings=$((warnings + 1))
    else
        ocf_log info "✓ max_standby_streaming_delay='${max_standby_delay}' (OK)"
    fi

    # 10. Check archive_mode and archive_command (availability risk)
    local archive_mode=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW archive_mode\"" 2>/dev/null)
    if [ "$archive_mode" != "on" ]; then
        ocf_log info "INFO: archive_mode='${archive_mode}' (archiving disabled)"
        ocf_log info "OPTIONAL: Enable WAL archiving for point-in-time recovery"
        ocf_log info "  Set: archive_mode = on"
        ocf_log info "  Set: archive_command = 'test ! -f /archive/%f && cp %p /archive/%f'"
    else
        ocf_log info "✓ archive_mode='${archive_mode}' (archiving enabled)"

        # Check archive_command for potential blocking issues
        local archive_command=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW archive_command\"" 2>/dev/null)
        if [ -n "$archive_command" ] && [ "$archive_command" != "/bin/true" ] && [ "$archive_command" != "" ]; then
            ocf_log info "Archive command: ${archive_command}"

            # Check if archive_command has error handling (contains '||' or '; true' or similar)
            if ! echo "$archive_command" | grep -qE '(\|\||;.*true|&&.*true)'; then
                ocf_log warn "AVAILABILITY WARNING: archive_command lacks error handling"
                ocf_log warn "If archiving fails (disk full, network down), PostgreSQL will BLOCK all writes"
                ocf_log warn "RECOMMENDATION: Add error handling to archive_command"
                ocf_log warn "Example: '${archive_command} || /bin/true' (fails gracefully)"
                ocf_log warn "Or disable: 'archive_mode = off' if PITR not needed"
                warnings=$((warnings + 1))
            fi
        fi
    fi

    # 11. Check listen_addresses (security notice)
    local listen_addresses=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW listen_addresses\"" 2>/dev/null)
    if [ "$listen_addresses" = "*" ]; then
        ocf_log info "SECURITY NOTICE: listen_addresses='*' (listening on all network interfaces)"
        ocf_log info "For production security, restrict to cluster network"
        ocf_log info "Example: listen_addresses = 'localhost,192.168.122.60'"
        ocf_log info "Note: pg_hba.conf provides additional access control"
    else
        ocf_log info "✓ listen_addresses='${listen_addresses}' (restricted)"
    fi

    # 12. Check if application_name is set in primary_conninfo (on standby)
    if pgsql_is_promoted; then
        ocf_log info "Instance is PRIMARY - configuration check complete"
    else
        # This is a standby - check primary_conninfo
        if [ -f "${PGDATA}/postgresql.auto.conf" ]; then
            local primary_conninfo=$(grep "primary_conninfo" "${PGDATA}/postgresql.auto.conf" 2>/dev/null)
            if [ -n "$primary_conninfo" ]; then
                if echo "$primary_conninfo" | grep -q "application_name"; then
                    local app_in_conninfo=$(echo "$primary_conninfo" | grep -o "application_name=[^ '\"]*" | cut -d= -f2)
                    if [ "$app_in_conninfo" = "${expected_app_name}" ]; then
                        ocf_log info "✓ primary_conninfo application_name='${app_in_conninfo}' (matches expected)"
                    else
                        ocf_log warn "CONFIGURATION WARNING: primary_conninfo application_name='${app_in_conninfo}'"
                        ocf_log warn "Expected: application_name='${expected_app_name}'"
                        ocf_log warn "This may cause synchronous_standby_names mismatch"
                        warnings=$((warnings + 1))
                    fi
                else
                    ocf_log warn "CONFIGURATION WARNING: primary_conninfo missing application_name parameter"
                    ocf_log warn "SUGGESTION: Add 'application_name=${expected_app_name}' to primary_conninfo"
                    warnings=$((warnings + 1))
                fi
            fi
        fi
        ocf_log info "Instance is STANDBY - configuration check complete"
    fi

    # Validate pg_hba.conf entries (on PRIMARY only)
    if pgsql_is_promoted; then
        validate_pghba_config
        local pghba_rc=$?
        if [ $pghba_rc -ne 0 ]; then
            config_ok=1
        fi
    fi

    # Summary
    if [ $config_ok -ne 0 ]; then
        ocf_log err "PostgreSQL configuration has ERRORS - replication may not work!"
        ocf_log err "Please fix the errors listed above and restart PostgreSQL"
        return 1
    elif [ $warnings -gt 0 ]; then
        ocf_log warn "PostgreSQL configuration check completed with ${warnings} warning(s)"
        ocf_log warn "Review warnings above to ensure optimal replication setup"
        return 0
    else
        ocf_log info "PostgreSQL configuration check PASSED - all settings OK"
        return 0
    fi
}

# Return 0 when the PostgreSQL postmaster is alive, 1 otherwise.
# Liveness = postmaster.pid exists AND the PID it names answers kill -0.
# BUG FIX: the bare-metal branch previously set PID as a *global*
# (the container branch already used `local PID`, so no caller could
# rely on the global consistently) — now local in both branches.
pgsql_is_running() {
    local pid
    if [ "${OCF_RESKEY_container_mode}" = "true" ]; then
        # Container mode: check if container is running AND PostgreSQL is running inside
        if ! container_is_running; then
            return 1
        fi

        # Check if postmaster is running inside the container
        # NOTE(review): the in-container data dir is hard-coded to
        # /var/lib/pgsql/data rather than ${PGDATA} — assumes a fixed
        # container image layout; confirm if it can ever differ.
        if container_exec test -f /var/lib/pgsql/data/postmaster.pid 2>/dev/null; then
            pid=$(container_exec head -n 1 /var/lib/pgsql/data/postmaster.pid 2>/dev/null)
            if [ -n "$pid" ] && container_exec kill -0 "$pid" 2>/dev/null; then
                return 0
            fi
        fi
        return 1
    else
        # Bare-metal mode: traditional pidfile + signal-0 check
        if [ -f "${PGDATA}/postmaster.pid" ]; then
            pid=$(head -n 1 "${PGDATA}/postmaster.pid")
            if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
                return 0
            fi
        fi
        return 1
    fi
}

# A node is "promoted" (primary) when PostgreSQL is up and the server
# reports it is NOT in recovery. Returns 0 for primary, 1 for
# standby/down/unreachable.
pgsql_is_promoted() {
    pgsql_is_running || return 1

    local in_recovery
    in_recovery=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SELECT pg_is_in_recovery()\"" 2>/dev/null)

    # 'f' (false) => not in recovery => promoted; anything else => standby
    [ "$in_recovery" = "f" ]
}

# Map a Pacemaker cluster node name to a resolvable hostname/IP.
# Needed when cluster node names differ from system hostnames.
# Resolution order: crm_node table -> exact node_list match ->
# first node_list entry (weak fallback) -> the name itself (returns 1).
resolve_cluster_node_to_hostname() {
    local cluster_node="$1"
    local node_hostname

    # Preferred: ask Pacemaker itself when crm_node is available
    if command -v crm_node >/dev/null 2>&1; then
        node_hostname=$(crm_node -l 2>/dev/null | grep "^[0-9]* ${cluster_node} " | awk '{print $2}')

        if [ -n "$node_hostname" ] && [ "$node_hostname" != "$cluster_node" ]; then
            ocf_log debug "Resolved cluster node '${cluster_node}' to hostname '${node_hostname}'"
            echo "$node_hostname"
            return 0
        fi
    fi

    # node_list entries are expected to be resolvable hostnames or IPs
    if [ -n "${OCF_RESKEY_node_list}" ]; then
        local node
        for node in ${OCF_RESKEY_node_list}; do
            [ "$node" = "$cluster_node" ] || continue
            echo "$node"
            return 0
        done

        # Weak fallback: assume the first node_list entry is the match
        ocf_log warn "Cluster node '${cluster_node}' not in node_list, using first entry"
        echo "${OCF_RESKEY_node_list}" | awk '{print $1}'
        return 0
    fi

    # Last resort: hand back the cluster node name (may not resolve)
    ocf_log warn "Cannot resolve cluster node '${cluster_node}' to hostname, using as-is"
    echo "$cluster_node"
    return 1
}

# Query a single PostgreSQL configuration parameter via `postgres -C`.
# Works with any config layout (postgresql.conf, includes, custom files).
# Usage:   get_pg_config_value <parameter_name>
# Outputs: the trimmed, unquoted value (empty if unset or on error).
get_pg_config_value() {
    local param_name="$1"
    local config_value=""

    # Route through the container when container mode is enabled
    case "${OCF_RESKEY_container_mode}" in
        true|yes)
            config_value=$(container_exec_as_postgres "$POSTGRES" -C "$param_name" -D "${PGDATA}" 2>/dev/null)
            ;;
        *)
            config_value=$(run_as_pguser "$POSTGRES" -C "$param_name" -D "${PGDATA}" 2>/dev/null)
            ;;
    esac

    # Strip quotes/spaces, then trim any remaining surrounding whitespace
    config_value=$(echo "$config_value" | tr -d "' \"" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')

    echo "$config_value"
}

discover_promoted_node() {
    # Discover which node is currently promoted (primary)
    # Try multiple methods in order of reliability
    #
    # Outputs:  hostname/IP of the promoted node on stdout (nothing on failure)
    # Returns:  0 if a promoted node was found, 1 otherwise
    # Globals read: OCF_RESKEY_vip, OCF_RESKEY_node_list, OCF_RESKEY_pgport,
    #               OCF_RESOURCE_INSTANCE, PSQL

    local promoted_node=""

    # Method 1: Use VIP if configured
    # The VIP follows the primary, so a successful "not in recovery" query
    # through the VIP identifies the promoted instance.
    if [ -n "${OCF_RESKEY_vip}" ]; then
        ocf_log debug "Attempting to discover promoted node via VIP ${OCF_RESKEY_vip}"

        # Try to connect to VIP and check if it's promoted
        local is_primary=$(run_as_pguser sh -c "${PSQL} -h ${OCF_RESKEY_vip} -p ${OCF_RESKEY_pgport} -Atc \"SELECT NOT pg_is_in_recovery()\"" 2>/dev/null)

        if [ "$is_primary" = "t" ]; then
            # VIP responds and is primary, now get its hostname
            # NOTE(review): split_part(inet_server_addr()::text, '.', 4) yields
            # only the LAST octet of an IPv4 address (e.g. "42" for 10.0.0.42).
            # Confirm that this is a connectable identifier in this deployment;
            # the VIP fallback below hides the issue whenever the query is empty.
            promoted_node=$(run_as_pguser sh -c "${PSQL} -h ${OCF_RESKEY_vip} -p ${OCF_RESKEY_pgport} -Atc \"SELECT split_part(inet_server_addr()::text, '.', 4)\"" 2>/dev/null)

            if [ -z "$promoted_node" ]; then
                # Fallback: just use VIP directly
                promoted_node="${OCF_RESKEY_vip}"
            fi

            ocf_log info "Discovered promoted node via VIP: ${promoted_node}"
            echo "$promoted_node"
            return 0
        fi
    fi

    # Method 2: Query each node in node_list
    # Ask every configured node directly whether it is out of recovery.
    # node_list is intentionally unquoted here: word-splitting on spaces is
    # how the list is iterated.
    if [ -n "${OCF_RESKEY_node_list}" ]; then
        ocf_log debug "Attempting to discover promoted node via node_list"

        for node in ${OCF_RESKEY_node_list}; do
            local is_primary=$(run_as_pguser sh -c "${PSQL} -h ${node} -p ${OCF_RESKEY_pgport} -Atc \"SELECT NOT pg_is_in_recovery()\"" 2>/dev/null)

            if [ "$is_primary" = "t" ]; then
                promoted_node="$node"
                ocf_log info "Discovered promoted node via node_list: ${promoted_node}"
                echo "$promoted_node"
                return 0
            fi
        done
    fi

    # Method 3: Query Pacemaker CIB (as fallback)
    # Parse crm_mon text output: scope to OUR clone set, then take the last
    # field of its "Promoted:" line (bracket characters stripped).
    if command -v crm_mon >/dev/null 2>&1; then
        ocf_log debug "Attempting to discover promoted node via Pacemaker CIB"

        # Extract base resource name from instance (e.g., postgres-db-18:0 → postgres-db-18)
        local resource_name=$(echo "$OCF_RESOURCE_INSTANCE" | sed 's/:[0-9]*$//')

        # Find promoted node within THIS clone resource only (not all clones)
        # This prevents finding promoted nodes from other parallel PostgreSQL clusters
        # NOTE(review): this depends on crm_mon's human-readable layout
        # ("Clone Set: ... [name]" / "Promoted: [node]") — verify against the
        # Pacemaker version in use, as the format has changed across releases.
        local cluster_node=$(crm_mon -r -1 2>/dev/null | awk -v res="$resource_name" '
            /Clone Set:/ { in_our_clone=0 }
            $0 ~ "Clone Set:.*\\[" res "\\]" { in_our_clone=1 }
            in_our_clone && /Promoted:/ {
                gsub(/[\[\]]/,"")
                print $NF
                exit
            }
        ')

        if [ -n "$cluster_node" ]; then
            ocf_log info "Discovered promoted cluster node via Pacemaker CIB: ${cluster_node}"

            # Resolve cluster node name to actual hostname/IP
            # (Pacemaker node names are not guaranteed to be DNS-resolvable)
            promoted_node=$(resolve_cluster_node_to_hostname "$cluster_node")

            if [ $? -eq 0 ]; then
                ocf_log info "Resolved to connectable hostname: ${promoted_node}"
                echo "$promoted_node"
                return 0
            else
                ocf_log warn "Could not resolve cluster node '${cluster_node}' to hostname"
            fi
        fi
    fi

    ocf_log warn "Could not discover promoted node via any method"
    return 1
}

check_replication_health() {
    # Verify that this standby is actively streaming WAL from its primary.
    # Returns 0 when replication is healthy (or when running as primary,
    # which has no upstream to check), 1 when unhealthy.

    # Promoted nodes have nothing to replicate from — always healthy.
    if pgsql_is_promoted; then
        return 0
    fi

    # Ask PostgreSQL for the WAL receiver process state; empty output means
    # no receiver process exists (or the query itself failed).
    local receiver_state
    receiver_state=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SELECT status FROM pg_stat_wal_receiver\"" 2>/dev/null)

    case "$receiver_state" in
        streaming)
            ocf_log debug "Replication healthy: WAL receiver status = streaming"
            return 0
            ;;
        "")
            ocf_log warn "Replication unhealthy: No WAL receiver process running"
            return 1
            ;;
        *)
            ocf_log warn "Replication unhealthy: WAL receiver status = ${receiver_state}"
            return 1
            ;;
    esac
}

pgsql_monitor() {
    # Pacemaker monitor action for this resource.
    # Returns: $OCF_NOT_RUNNING        - stopped, or basebackup still running
    #          $OCF_RUNNING_PROMOTED   - healthy primary
    #          $OCF_SUCCESS            - healthy standby (or standby below the
    #                                    replication-failure threshold)
    #          $OCF_ERR_GENERIC        - running but unresponsive, or recovery failed
    local rc

    # Check if basebackup is in progress (for status reporting)
    # BUG FIX v1.6.6: Check state directory, not PGDATA
    local state_dir=$(dirname "${PGDATA}")
    if [ -f "${state_dir}/.pgtwin_basebackup_in_progress" ]; then
        check_basebackup_progress
        # Still in progress, return not running so resource won't be started yet
        # (check_basebackup_progress removes the marker on completion, hence the re-check)
        if [ -f "${state_dir}/.pgtwin_basebackup_in_progress" ]; then
            ocf_log info "Basebackup still in progress, instance not ready"
            return $OCF_NOT_RUNNING
        fi
    fi

    # OPTIMIZATION: If basebackup just completed (rc file very recent), trigger resource cleanup
    # This makes Pacemaker retry the start operation immediately instead of waiting
    local rc_file="${state_dir}/.pgtwin_basebackup_rc"
    if [ -f "${rc_file}" ]; then
        # Calculate file age without command substitution
        # NOTE(review): /tmp/.pgtwin_now.$$ and /tmp/.pgtwin_rctime.$$ are
        # predictable names in a world-writable directory (symlink-attack
        # surface); also `stat -c %Y` is GNU-specific — confirm target platforms.
        local now_ts rc_file_ts rc_age
        date +%s > /tmp/.pgtwin_now.$$
        read now_ts < /tmp/.pgtwin_now.$$
        stat -c %Y "${rc_file}" 2>/dev/null > /tmp/.pgtwin_rctime.$$ || echo 0 > /tmp/.pgtwin_rctime.$$
        read rc_file_ts < /tmp/.pgtwin_rctime.$$
        rm -f /tmp/.pgtwin_now.$$ /tmp/.pgtwin_rctime.$$
        rc_age=$((now_ts - rc_file_ts))

        if [ "$rc_age" -lt 30 ]; then
            # Basebackup completed less than 30 seconds ago
            local bb_rc
            read bb_rc < "${rc_file}" 2>/dev/null || bb_rc="1"
            rm -f "${rc_file}"  # Clean up tracking file

            # NOTE(review): if the rc file contains non-numeric data this
            # numeric test errors and falls through to the else branch.
            if [ "$bb_rc" -eq 0 ]; then
                ocf_log info "Basebackup successfully completed (${rc_age}s ago), triggering resource cleanup to start PostgreSQL"
                # Trigger Pacemaker to cleanup and retry start operation
                # Use background job to avoid blocking monitor
                (crm_resource --resource ${OCF_RESOURCE_INSTANCE} --cleanup >/dev/null 2>&1 &)
            else
                ocf_log err "Basebackup failed (${rc_age}s ago, exit code ${bb_rc})"
            fi
        fi
    fi

    # Process-level liveness check first; no point querying a dead server.
    if ! pgsql_is_running; then
        return $OCF_NOT_RUNNING
    fi

    # Try to connect
    run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c 'SELECT 1' >/dev/null 2>&1"
    rc=$?

    if [ $rc -ne 0 ]; then
        ocf_log err "PostgreSQL is running but not responding to queries"
        return $OCF_ERR_GENERIC
    fi

    # Check if promoted or unpromoted
    if pgsql_is_promoted; then
        # Check replication slots on primary
        check_replication_slots

        # DYNAMIC UPDATE v1.6.18: Update synchronous_standby_names based on cluster topology
        # This adapts to standbys connecting/disconnecting and cluster changes
        if [ "${OCF_RESKEY_rep_mode}" = "sync" ]; then
            local current_sync=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW synchronous_standby_names\"" 2>/dev/null)

            # Skip if admin uses advanced syntax (FIRST, ANY)
            if ! echo "$current_sync" | grep -qE '^FIRST |^ANY '; then
                # Get safe standby names based on current cluster topology + connections
                local safe_sync=$(get_safe_synchronous_standby_names)

                if [ "$current_sync" != "$safe_sync" ]; then
                    ocf_log info "Monitor: Updating synchronous_standby_names due to cluster topology change"
                    ocf_log info "Monitor: Old value: '$current_sync'"
                    ocf_log info "Monitor: New value: '$safe_sync'"

                    # ALTER SYSTEM writes postgresql.auto.conf; pg_reload_conf()
                    # below applies it without a restart.
                    run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \
                        \"ALTER SYSTEM SET synchronous_standby_names = '${safe_sync}'\"" >/dev/null 2>&1

                    if [ $? -eq 0 ]; then
                        run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c 'SELECT pg_reload_conf()'" >/dev/null 2>&1
                        ocf_log info "Monitor: synchronous_standby_names updated successfully"
                    else
                        ocf_log warn "Monitor: Failed to update synchronous_standby_names"
                    fi
                fi
            else
                ocf_log debug "Monitor: synchronous_standby_names uses advanced syntax, skipping auto-update"
            fi
        fi

        # Check for archive failures (can cause cluster to block)
        # Failures are only logged (never fatal): WAL piles up but the
        # primary keeps serving.
        local archive_mode=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SHOW archive_mode\"" 2>/dev/null)
        if [ "$archive_mode" = "on" ]; then
            local failed_archives=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SELECT failed_count FROM pg_stat_archiver\"" 2>/dev/null)
            local last_failed_time=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SELECT last_failed_time FROM pg_stat_archiver WHERE last_failed_time IS NOT NULL\"" 2>/dev/null)

            if [ -n "$failed_archives" ] && [ "$failed_archives" -gt 0 ]; then
                if [ -n "$last_failed_time" ]; then
                    ocf_log warn "Archive failures detected: ${failed_archives} total failures, last at ${last_failed_time}"
                else
                    ocf_log warn "Archive failures detected: ${failed_archives} total failures"
                fi
                ocf_log warn "Check archive_command and destination availability"
                ocf_log warn "If archiving continues to fail, WAL will accumulate and may fill disk"
            fi
        fi

        return $OCF_RUNNING_PROMOTED
    else
        # Enhanced replication health monitoring for standby (v1.6)
        # Consecutive failures are tracked in a transient (reboot-lifetime)
        # node attribute; crossing the threshold triggers automatic recovery.
        check_replication_health
        local repl_health=$?

        if [ $repl_health -eq 0 ]; then
            # Replication is healthy, reset failure counter
            ocf_run crm_attribute -N $(get_cluster_node_name) -n postgres-replication-failures -v 0 -l reboot 2>/dev/null
            return $OCF_SUCCESS
        else
            # Replication is unhealthy, increment failure counter
            local failure_count=$(crm_attribute -N $(get_cluster_node_name) -n postgres-replication-failures -G -q -d 0 -l reboot 2>/dev/null || echo "0")
            failure_count=$((failure_count + 1))
            ocf_run crm_attribute -N $(get_cluster_node_name) -n postgres-replication-failures -v $failure_count -l reboot 2>/dev/null

            ocf_log warn "Replication failure detected (count: ${failure_count}/${OCF_RESKEY_replication_failure_threshold})"

            # Check if threshold exceeded
            if [ $failure_count -ge ${OCF_RESKEY_replication_failure_threshold} ]; then
                ocf_log err "Replication failure threshold (${OCF_RESKEY_replication_failure_threshold}) exceeded"
                ocf_log err "Triggering automatic recovery (pg_rewind/pg_basebackup)"

                # Discover promoted node for recovery
                local primary_host=$(discover_promoted_node)

                if [ -n "$primary_host" ]; then
                    ocf_log info "Initiating recovery from primary: ${primary_host}"

                    # Reset failure counter before recovery attempt
                    ocf_run crm_attribute -N $(get_cluster_node_name) -n postgres-replication-failures -v 0 -l reboot 2>/dev/null

                    # Trigger recovery (this will stop PostgreSQL, run pg_rewind/basebackup, and restart)
                    # NOTE(review): recover_standby is a long operation executed
                    # inside monitor — ensure the monitor timeout accommodates it.
                    recover_standby "$primary_host"
                    local recovery_rc=$?

                    if [ $recovery_rc -eq 0 ]; then
                        ocf_log info "Automatic replication recovery completed successfully"
                        return $OCF_SUCCESS
                    else
                        ocf_log err "Automatic replication recovery failed (rc=$recovery_rc)"
                        return $OCF_ERR_GENERIC
                    fi
                else
                    ocf_log err "Cannot trigger recovery: unable to discover promoted node"
                    return $OCF_ERR_GENERIC
                fi
            fi

            # Haven't reached threshold yet, still return success but log warning
            return $OCF_SUCCESS
        fi
    fi
}

check_replication_slots() {
    # Check replication slot WAL retention on the primary and drop the slot
    # when retention grows past safe limits (standby re-syncs via basebackup).
    # v1.6.9: Enhanced with database size comparison and configurable disable
    # Globals read: OCF_RESKEY_slot_name, OCF_RESKEY_max_slot_wal_keep_size,
    #               OCF_RESKEY_pgport, PSQL, PGDATA
    # Returns 0 for all non-fatal outcomes; drop failures are only logged.

    # No slot configured: nothing to manage.
    if [ -z "${OCF_RESKEY_slot_name}" ]; then
        return 0
    fi

    # Special value 0 = disable automatic slot dropping
    if [ "${OCF_RESKEY_max_slot_wal_keep_size}" = "0" ]; then
        ocf_log debug "Automatic slot dropping disabled (max_slot_wal_keep_size=0)"
        return 0
    fi

    # Retained WAL in MB: distance between current write LSN and the slot's
    # restart_lsn.
    local slot_size=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"
        SELECT pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) / 1024 / 1024
        FROM pg_replication_slots
        WHERE slot_name='${OCF_RESKEY_slot_name}'
    \"" 2>/dev/null)

    if [ -z "$slot_size" ]; then
        # Slot doesn't exist or query failed
        return 0
    fi

    # Round to an integer so shell arithmetic comparisons work
    slot_size=$(printf "%.0f" "$slot_size" 2>/dev/null)

    # Guard against non-numeric output (query noise, printf failure)
    case "$slot_size" in
        ''|*[!0-9-]*)
            ocf_log debug "Could not parse replication slot size: '${slot_size}'"
            return 0
            ;;
    esac

    # Minimum slot size threshold: never drop slots smaller than this
    # Prevents dropping fresh slots in development environments with tiny databases
    local min_drop_threshold=500  # MB

    # BUG FIX: use shell integer arithmetic instead of piping to bc.
    # With bc missing, the old `| bc 2>/dev/null || echo 0` fallback made
    # every comparison evaluate false, silently disabling slot dropping.
    if [ "$slot_size" -lt "$min_drop_threshold" ]; then
        ocf_log debug "Replication slot ${OCF_RESKEY_slot_name} size ${slot_size} MB below minimum drop threshold (${min_drop_threshold} MB) - not dropping"
        return 0
    fi

    local max_size=${OCF_RESKEY_max_slot_wal_keep_size}
    local should_drop=false
    local drop_reason=""

    # Check 1: Exceeds configured threshold
    if [ "$slot_size" -gt "$max_size" ]; then
        should_drop=true
        drop_reason="exceeds configured max_slot_wal_keep_size ($slot_size MB > $max_size MB)"
    fi

    # Check 2: Slot size exceeds database size (additional safety check).
    # If the slot retains more WAL than the whole database, a fresh
    # basebackup is cheaper than replaying all of it.
    local db_size_mb=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"
        SELECT pg_database_size('postgres') / 1024 / 1024
    \"" 2>/dev/null)

    if [ -n "$db_size_mb" ]; then
        db_size_mb=$(printf "%.0f" "$db_size_mb" 2>/dev/null)

        case "$db_size_mb" in
            ''|*[!0-9-]*)
                ocf_log debug "Could not determine database size for slot comparison"
                ;;
            *)
                if [ "$slot_size" -gt "$db_size_mb" ]; then
                    ocf_log warn "Replication slot size ($slot_size MB) exceeds database size ($db_size_mb MB)"
                    ocf_log warn "This indicates standby has been offline for a very long time"
                    ocf_log warn "pg_basebackup would be more efficient than replaying all accumulated WAL"

                    # Override should_drop even if under configured threshold
                    should_drop=true
                    drop_reason="exceeds database size ($slot_size MB > $db_size_mb MB) - pg_basebackup more efficient"
                fi
                ;;
        esac
    else
        ocf_log debug "Could not determine database size for slot comparison"
    fi

    # Drop slot if any condition met
    if [ "$should_drop" = "true" ]; then
        ocf_log warn "Replication slot ${OCF_RESKEY_slot_name} will be dropped: ${drop_reason}"
        ocf_log info "Standby will be automatically recovered via pg_basebackup when it reconnects"

        # Drop the replication slot
        run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \"
            SELECT pg_drop_replication_slot('${OCF_RESKEY_slot_name}')
        \"" 2>/dev/null
        local drop_rc=$?

        if [ $drop_rc -eq 0 ]; then
            ocf_log info "Replication slot ${OCF_RESKEY_slot_name} dropped successfully"

            # Mark that full resync is needed
            run_as_pguser touch "${PGDATA}/.need_full_resync"
            ocf_log info "Created .need_full_resync marker - standby will trigger pg_basebackup"
        else
            ocf_log err "Failed to drop replication slot ${OCF_RESKEY_slot_name} (rc=$drop_rc)"
        fi
    else
        ocf_log debug "Replication slot ${OCF_RESKEY_slot_name} size OK: ${slot_size} MB (max: ${max_size} MB)"
    fi
}

cleanup_old_replication_slot() {
    # When a former primary rejoins the cluster as a standby, its stale
    # replication slot must be removed; the current primary performs the
    # actual removal, so this function only reports the condition.
    if ! pgsql_is_running; then
        return 0
    fi
    if pgsql_is_promoted; then
        return 0
    fi
    ocf_log info "Cleaning up replication slot on former primary (now standby)"
    # The primary will do the cleanup
    return 0
}

is_valid_pgdata() {
    # Check if PGDATA exists and contains a valid PostgreSQL cluster
    # Returns 0 if valid, 1 if empty/missing/invalid
    # Note: This does NOT check version compatibility - use validate_pg_version() separately

    # The directory must exist at all
    if [ ! -d "${PGDATA}" ]; then
        ocf_log debug "PGDATA does not exist: ${PGDATA}"
        return 1
    fi

    # Check if directory is empty
    # BUG FIX: quote ${PGDATA} — unquoted, a path containing whitespace was
    # word-split, `ls` failed, and a valid cluster was misreported as empty.
    if [ -z "$(ls -A "${PGDATA}" 2>/dev/null)" ]; then
        ocf_log debug "PGDATA is empty: ${PGDATA}"
        return 1
    fi

    # Check for PG_VERSION file (present in all valid PostgreSQL clusters;
    # written by initdb and pg_basebackup)
    if [ ! -f "${PGDATA}/PG_VERSION" ]; then
        ocf_log warn "PGDATA missing PG_VERSION file (corrupted or incomplete): ${PGDATA}"
        return 1
    fi

    ocf_log debug "PGDATA is valid: ${PGDATA}"
    return 0
}

validate_pg_version() {
    # Validate PostgreSQL version matches between data directory and binary/container
    # Returns 0 if valid or no data exists, 1 if version mismatch

    local data_version=""
    local expected_version=""

    # Get data version from PG_VERSION file (holds the major version, e.g. "18")
    if [ -f "${PGDATA}/PG_VERSION" ]; then
        data_version=$(tr -d '[:space:]' < "${PGDATA}/PG_VERSION" 2>/dev/null)
        ocf_log debug "Data directory PostgreSQL version: ${data_version}"
    else
        # No data exists yet - validation passes (will be initialized)
        ocf_log debug "No PG_VERSION file found - assuming new installation"
        return 0
    fi

    # Get expected version based on mode
    if [ "${OCF_RESKEY_container_mode}" = "true" ] || [ "${OCF_RESKEY_container_mode}" = "yes" ]; then
        # Container mode: use configured pg_major_version
        if [ -n "${OCF_RESKEY_pg_major_version}" ]; then
            expected_version="${OCF_RESKEY_pg_major_version}"
            ocf_log debug "Container mode: Expected PostgreSQL version from config: ${expected_version}"
        else
            ocf_log err "Container mode enabled but pg_major_version not configured"
            return 1
        fi
    else
        # Bare-metal mode: get version from installed PostgreSQL binary
        # Source container library if not already loaded
        if ! type pgtwin_get_pg_major_version >/dev/null 2>&1; then
            if [ -f /usr/lib/ocf/lib/heartbeat/pgtwin-container-lib.sh ]; then
                . /usr/lib/ocf/lib/heartbeat/pgtwin-container-lib.sh
            else
                ocf_log info "Container library not available (container mode is experimental)"
                # Fallback: try to get version from binary directly
                local pg_binary="${OCF_RESKEY_pg_bindir}/postgres"
                if [ ! -x "$pg_binary" ]; then
                    # Try fallback locations
                    for fallback in /usr/bin/postgres /usr/pgsql-*/bin/postgres; do
                        if [ -x "$fallback" ]; then
                            pg_binary="$fallback"
                            break
                        fi
                    done
                fi
                # BUG FIX: quote "$pg_binary" (paths with spaces were word-split)
                # and use portable ERE `grep -oE '[0-9]+'` instead of GNU-only
                # `grep -oP '\d+'`, which silently yields no version on
                # BSD/busybox grep and caused a spurious validation failure.
                local version_output=""
                if [ -x "$pg_binary" ]; then
                    version_output=$("$pg_binary" --version 2>/dev/null)
                fi
                expected_version=$(echo "$version_output" | grep -oE '[0-9]+' | head -n 1)
            fi
        fi

        if type pgtwin_get_pg_major_version >/dev/null 2>&1; then
            expected_version=$(pgtwin_get_pg_major_version)
        fi

        if [ -z "$expected_version" ]; then
            ocf_log err "Cannot determine PostgreSQL binary version"
            return 1
        fi
        ocf_log debug "Bare-metal mode: Expected PostgreSQL version from binary: ${expected_version}"
    fi

    # Compare versions
    if [ "${data_version}" != "${expected_version}" ]; then
        ocf_log err "PostgreSQL version mismatch detected!"
        ocf_log err "  Data directory version: ${data_version} (${PGDATA}/PG_VERSION)"
        ocf_log err "  Expected version: ${expected_version}"
        ocf_log err ""
        ocf_log err "This mismatch can cause:"
        ocf_log err "  - Startup failures"
        ocf_log err "  - Data corruption"
        ocf_log err "  - Incompatibility errors"
        ocf_log err ""
        ocf_log err "Resolution options:"
        if [ "${OCF_RESKEY_container_mode}" = "true" ] || [ "${OCF_RESKEY_container_mode}" = "yes" ]; then
            ocf_log err "  1. Use matching PostgreSQL version (set pg_major_version='${data_version}' in cluster config)"
        else
            ocf_log err "  1. Install PostgreSQL ${data_version} package to match existing data"
        fi
        ocf_log err "  2. Backup data and reinitialize with PostgreSQL ${expected_version}"
        ocf_log err "  3. Perform pg_upgrade (manual procedure, requires cluster stop)"
        return 1
    fi

    ocf_log info "PostgreSQL version validation passed: ${expected_version}"
    return 0
}

pgsql_start() {
    local rc

    if pgsql_is_running; then
        ocf_log info "PostgreSQL is already running"
        # Perform configuration check on already running instance
        check_postgresql_config
        return $OCF_SUCCESS
    fi

    # Check if basebackup is in progress from previous start attempt
    # BUG FIX v1.6.6: Check state directory, not PGDATA
    local state_dir=$(dirname "${PGDATA}")
    if [ -f "${state_dir}/.pgtwin_basebackup_in_progress" ]; then
        ocf_log info "Basebackup already in progress, checking status"
        check_basebackup_progress
        if [ -f "${state_dir}/.pgtwin_basebackup_in_progress" ]; then
            ocf_log info "Basebackup still in progress, instance not ready"
            return $OCF_NOT_RUNNING
        fi
        # Basebackup completed, continue to start PostgreSQL
        ocf_log info "Basebackup completed, proceeding to start PostgreSQL"
    fi

    # Auto-initialization: Check if PGDATA is empty/missing/invalid
    if ! is_valid_pgdata; then
        ocf_log info "PGDATA is empty or invalid - triggering automatic standby initialization"

        # Ensure .pgpass exists
        ensure_pgpass
        if [ $? -ne 0 ]; then
            ocf_log err "Cannot initialize standby: .pgpass file not configured or invalid"
            ocf_log err "Please create ${OCF_RESKEY_pgpassfile} with replication credentials"
            return $OCF_ERR_CONFIGURED
        fi
        local pgpass_file="$PGPASS_FILE"

        # Discover primary node from Pacemaker cluster
        local primary_host=$(discover_promoted_node)
        if [ -z "$primary_host" ]; then
            ocf_log err "Cannot initialize standby: unable to discover primary node from cluster"
            ocf_log err "Ensure the primary node is running and promoted"
            return $OCF_ERR_GENERIC
        fi

        # Get replication user from .pgpass
        local rep_user=$(get_replication_user)
        if [ -z "$rep_user" ]; then
            ocf_log err "Cannot initialize standby: unable to determine replication user from .pgpass"
            ocf_log err "Check ${pgpass_file} contains entry for ${primary_host}"
            return $OCF_ERR_CONFIGURED
        fi

        # Check disk space before proceeding
        if ! check_disk_space_for_basebackup "${primary_host}"; then
            ocf_log err "Cannot initialize standby: insufficient disk space for pg_basebackup"
            return $OCF_ERR_GENERIC
        fi

        ocf_log info "Auto-initializing standby from primary: ${primary_host} (user: ${rep_user})"

        # Create empty PGDATA with correct ownership if needed
        if [ ! -d "${PGDATA}" ]; then
            mkdir -p "${PGDATA}" || {
                ocf_log err "Failed to create PGDATA directory: ${PGDATA}"
                return $OCF_ERR_GENERIC
            }
            chmod 750 "${PGDATA}"
        fi

        # CRITICAL: pg_basebackup requires PGDATA to be completely empty
        # Check for any leftover files and fail with clear error
        find "${PGDATA}" -mindepth 1 -maxdepth 1 2>/dev/null | wc -l > /tmp/.pgtwin_filecount.$$
        local file_count
        read file_count < /tmp/.pgtwin_filecount.$$
        rm -f /tmp/.pgtwin_filecount.$$

        if [ "$file_count" -gt 0 ]; then
            ocf_log err "Cannot run pg_basebackup: PGDATA directory is not empty"
            ocf_log err "Found $file_count file(s) in ${PGDATA}"
            ocf_log err "pg_basebackup requires a completely empty directory"
            ocf_log err "Please manually verify and remove contents: rm -rf ${PGDATA}/* ${PGDATA}/.*"
            return $OCF_ERR_GENERIC
        fi

        chown ${OCF_RESKEY_pguser}:${OCF_RESKEY_pguser} "${PGDATA}"
        chmod 700 "${PGDATA}"

        # Start asynchronous pg_basebackup (existing function)
        # This creates .pgtwin_basebackup_in_progress marker in state directory
        start_async_basebackup "${primary_host}" "${rep_user}"
        local bb_rc=$?

        if [ $bb_rc -eq 0 ]; then
            ocf_log info "Automatic standby initialization started (pg_basebackup running in background)"
            ocf_log info "Monitor function will track progress and start PostgreSQL when ready"
            return $OCF_NOT_RUNNING
        else
            ocf_log err "Failed to start automatic standby initialization"
            return $OCF_ERR_GENERIC
        fi
    fi

    # Version validation: If PGDATA exists with data, validate version compatibility
    # This check is critical to prevent:
    # - Starting PostgreSQL 17 container with PostgreSQL 18 data
    # - Starting PostgreSQL 18 binary with PostgreSQL 17 data
    if [ -f "${PGDATA}/PG_VERSION" ]; then
        if ! validate_pg_version; then
            ocf_log err "Cannot start PostgreSQL: version mismatch between data and binary/container"
            return $OCF_ERR_CONFIGURED
        fi
    fi

    # Container mode: Ensure container is running before starting PostgreSQL
    if [ "${OCF_RESKEY_container_mode}" = "true" ]; then
        if type pgtwin_ensure_container_running >/dev/null 2>&1; then
            ocf_log info "Container mode enabled, ensuring container is running"
            pgtwin_ensure_container_running || {
                ocf_log err "Failed to start container"
                return $OCF_ERR_GENERIC
            }
        else
            ocf_log err "Container mode enabled but pgtwin_ensure_container_running not available"
            return $OCF_ERR_CONFIGURED
        fi
    fi

    # Check if we need to do recovery
    if [ -f "${PGDATA}/standby.signal" ]; then
        ocf_log info "Starting PostgreSQL as standby"

        # Non-blocking timeline divergence warning (best effort)
        # This helps admins detect timeline issues before PostgreSQL startup fails
        local local_timeline=""
        if command -v pg_controldata >/dev/null 2>&1; then
            local_timeline=$(run_as_pguser pg_controldata "${PGDATA}" 2>/dev/null | \
                grep "Latest checkpoint's TimeLineID:" | awk '{print $NF}')
        fi

        if [ -n "$local_timeline" ]; then
            ocf_log info "Local timeline: ${local_timeline}"

            # Try to find a node to query for timeline comparison (non-blocking, best effort)
            # Method 1: Try Pacemaker-discovered promoted node first (most reliable)
            local node_to_check=$(discover_promoted_node)

            # Method 2: If no promoted node found (e.g., during failover/demotion), try other cluster nodes
            # PostgreSQL may still be running on them even if not marked "Promoted" yet
            if [ -z "$node_to_check" ]; then
                ocf_log debug "No promoted node found via Pacemaker, trying other cluster nodes"
                local current_node=$(get_cluster_node_name)
                for node in ${OCF_RESKEY_node_list}; do
                    if [ "$node" != "$current_node" ]; then
                        node_to_check="$node"
                        ocf_log debug "Will try to query timeline from cluster node: ${node_to_check}"
                        break
                    fi
                done
            fi

            if [ -n "$node_to_check" ]; then
                local rep_user=$(get_replication_user)
                if [ -n "$rep_user" ]; then
                    # Try to query timeline from the node (works even if being demoted)
                    local other_timeline=$(run_as_pguser sh -c \
                        "${PSQL} -h ${node_to_check} -p ${OCF_RESKEY_pgport} -U ${rep_user} -d postgres -Atc \
                        'SELECT timeline_id FROM pg_control_checkpoint()' 2>/dev/null" || echo "")

                    if [ -n "$other_timeline" ]; then
                        ocf_log info "Other node timeline: ${other_timeline} (${node_to_check})"

                        if [ "$local_timeline" != "$other_timeline" ]; then
                            local current_node=$(get_cluster_node_name)

                            ocf_log warn "⚠️  TIMELINE DIVERGENCE DETECTED ⚠️
This node (${current_node}) timeline: ${local_timeline}
Other node (${node_to_check}) timeline: ${other_timeline}
Attempting automatic recovery (pg_rewind → pg_basebackup if safe)..."

                            # Stop PostgreSQL before recovery
                            if pgsql_is_running; then
                                pgsql_stop
                            fi

                            # Call recover_standby which tries pg_rewind first, then pg_basebackup
                            # We'll parse the result to determine if it was safe
                            recover_standby "${node_to_check}"
                            local recovery_rc=$?

                            if [ $recovery_rc -eq 0 ]; then
                                ocf_log info "✅ Automatic recovery succeeded - continuing startup"
                                # PostgreSQL is stopped after recovery, will be started by normal flow below
                            else
                                # Recovery failed - this should be rare with the fallback logic
                                ocf_log err "❌ Automatic recovery failed"
                                ocf_log err "Manual intervention required - check logs and data directory"
                                return $OCF_ERR_GENERIC
                            fi
                        fi
                    else
                        ocf_log debug "Could not query timeline from ${node_to_check} (may be stopped or unreachable)"
                    fi
                else
                    ocf_log debug "Could not determine replication user for timeline check"
                fi
            else
                ocf_log debug "No other cluster node available for timeline comparison"
            fi
        fi

        # Safety check: Verify standby configuration before starting
        # This catches cases where manual pg_basebackup was done or config was corrupted
        if [ -f "${PGDATA}/postgresql.auto.conf" ]; then
            local primary_conninfo=$(grep "^primary_conninfo" "${PGDATA}/postgresql.auto.conf" | cut -d= -f2- | tr -d "' ")

            # Check for common issues:
            # 1. Empty host= or user= (bug #2063-2071 before fix)
            # 2. Missing application_name
            # 3. Wrong passfile location
            if echo "$primary_conninfo" | grep -qE "host=\s|user=\s|host=$|user=$" || \
               ! echo "$primary_conninfo" | grep -q "application_name="; then
                ocf_log warn "Detected potentially incorrect standby configuration - attempting to fix"

                # Try to discover primary node to fix configuration
                local discovered_primary=$(discover_promoted_node)
                local rep_user=$(get_replication_user)

                if [ -n "$discovered_primary" ] && [ -n "$rep_user" ]; then
                    ocf_log info "Auto-fixing standby config: primary=${discovered_primary}, user=${rep_user}"
                    finalize_standby_config "${discovered_primary}" "${rep_user}"
                    if [ $? -ne 0 ]; then
                        ocf_log warn "Failed to auto-fix standby configuration - will attempt start anyway"
                    fi
                else
                    ocf_log warn "Cannot auto-fix standby config: unable to discover primary node or replication user"
                fi
            fi
        else
            ocf_log warn "Standby node missing postgresql.auto.conf - this may indicate incomplete pg_basebackup"
        fi
    else
        ocf_log info "Starting PostgreSQL as primary"

        # Single-node startup safety check (v1.6.14+)
        # Detect if we're starting alone with sync replication configured
        # This indicates a potential double-failure scenario where wrong node may be promoting
        if command -v crm_node >/dev/null 2>&1; then
            local node_count=$(crm_node -l 2>/dev/null | grep -c member)

            if [ "$node_count" -eq 1 ]; then
                # We're the only node in cluster - check sync configuration
                # Use postgres -C to query config (works with any config file structure)
                local sync_standby=$(get_pg_config_value "synchronous_standby_names")

                if [ -n "$sync_standby" ]; then
                    # Sync replication is configured but we're alone - DANGER!
                    ocf_log err "⚠️  SINGLE-NODE STARTUP WITH SYNC REPLICATION CONFIGURED ⚠️"
                    ocf_log err "⚠️  synchronous_standby_names='${sync_standby}'"
                    ocf_log err ""
                    ocf_log err "This configuration suggests one of these scenarios:"
                    ocf_log err "  1. Both nodes crashed (double failure)"
                    ocf_log err "  2. This node crashed first (other node may have newer data)"
                    ocf_log err "  3. This node is stale (other node was promoted and disabled sync)"
                    ocf_log err ""
                    ocf_log err "If this node was the primary when other node failed, pgtwin notify support"
                    ocf_log err "would have automatically disabled sync replication. Since sync is still"
                    ocf_log err "enabled, this node was likely NOT handling the failure."
                    ocf_log err ""
                    ocf_log err "REFUSING to start as primary - other node may have newer data!"
                    ocf_log err ""
                    ocf_log err "Recovery options:"
                    ocf_log err "  1. WAIT for other node to join cluster (RECOMMENDED)"
                    ocf_log err "  2. Check timelines on both nodes:"
                    ocf_log err "     pg_controldata ${PGDATA} | grep timeline"
                    ocf_log err "  3. If confirmed this node has latest timeline, manually override:"
                    ocf_log err "     - Stop cluster: crm configure property maintenance-mode=true"
                    ocf_log err "     - Start PostgreSQL manually, then: ALTER SYSTEM SET synchronous_standby_names = '';"
                    ocf_log err "     - Reload config: pg_ctl reload"
                    ocf_log err "     - Restart cluster: crm configure property maintenance-mode=false"
                    ocf_log err ""
                    return $OCF_ERR_CONFIGURED
                else
                    # Sync disabled (async mode) - SAFE to promote
                    ocf_log info "✓ Single-node startup with async replication (sync disabled)"
                    ocf_log info "✓ This is expected after standby failure (pgtwin notify support)"
                    ocf_log info "✓ Safe to start as primary"
                fi
            fi
        fi
    fi

    # Start PostgreSQL
    run_as_pguser sh -c "${PGCTL} -D ${PGDATA} -w -t 60 start" >/dev/null 2>&1
    rc=$?

    if [ $rc -ne 0 ]; then
        ocf_log err "Failed to start PostgreSQL"
        return $OCF_ERR_GENERIC
    fi

    # Wait for PostgreSQL to be ready
    local count=0
    while [ $count -lt 60 ]; do
        if run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c 'SELECT 1' >/dev/null 2>&1"; then
            ocf_log info "PostgreSQL started successfully"

            # v1.7.0: Update application_name and synchronous_standby_names using ALTER SYSTEM
            update_application_name_in_config

            # Perform configuration check after successful start
            check_postgresql_config
            rc=$?

            # Configuration check failures are warnings, not start failures
            # Return success even if config has warnings
            return $OCF_SUCCESS
        fi
        sleep 1
        count=$((count + 1))
    done

    ocf_log err "PostgreSQL started but not responding after 60 seconds"
    return $OCF_ERR_GENERIC
}

# Stop and clean up the managed container, if container mode is enabled and
# the container helper has been loaded. Safe to call unconditionally; it is
# a no-op otherwise.
pgsql_stop_container_cleanup() {
    if [ "${OCF_RESKEY_container_mode}" = "true" ]; then
        if type pgtwin_container_stop >/dev/null 2>&1; then
            ocf_log info "Stopping container"
            pgtwin_container_stop
        fi
    fi
}

pgsql_stop() {
    # Stop PostgreSQL: fast shutdown first, then escalate to an immediate
    # shutdown if the server is still up after 30 seconds.
    # Returns OCF_SUCCESS once the server is confirmed down (including the
    # already-stopped case), OCF_ERR_GENERIC if even the immediate stop fails.

    if ! pgsql_is_running; then
        ocf_log info "PostgreSQL is already stopped"
        return $OCF_SUCCESS
    fi

    ocf_log info "Stopping PostgreSQL"

    # Exit status intentionally ignored: the polling loop below is the
    # authoritative check for whether the server actually stopped
    run_as_pguser sh -c "${PGCTL} -D ${PGDATA} -m fast stop" >/dev/null 2>&1

    # Wait for stop
    local count=0
    while [ $count -lt 30 ]; do
        if ! pgsql_is_running; then
            ocf_log info "PostgreSQL stopped successfully"

            # Container mode: Stop and cleanup container
            pgsql_stop_container_cleanup

            return $OCF_SUCCESS
        fi
        sleep 1
        count=$((count + 1))
    done

    # Force stop if needed
    ocf_log warn "PostgreSQL did not stop gracefully, forcing stop"
    run_as_pguser sh -c "${PGCTL} -D ${PGDATA} -m immediate stop" >/dev/null 2>&1

    sleep 2

    if ! pgsql_is_running; then
        # Container mode: Stop and cleanup container
        pgsql_stop_container_cleanup

        return $OCF_SUCCESS
    fi

    ocf_log err "Failed to stop PostgreSQL"
    return $OCF_ERR_GENERIC
}

check_replication_lag() {
    # Return replay lag in bytes from standby perspective
    # (receive LSN minus replay LSN, queried from the local server).
    # Outputs the lag on stdout; outputs "0" and returns non-zero when the
    # lag cannot be determined. Returns 0 with output "0" if already primary.

    if pgsql_is_promoted; then
        echo "0"
        return 0
    fi

    # Split declaration from assignment so the command's status is not masked
    local lag
    lag=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"
        SELECT COALESCE(
            pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()),
            0
        )
    \"" 2>/dev/null)

    # BUG FIX: validate the output is a (possibly negative) integer. A failed
    # or garbled psql invocation could previously emit non-numeric text here,
    # which would crash the caller's numeric [ -le ] comparison.
    case "$lag" in
        ''|*[!0-9-]*)
            echo "0"
            return 1
            ;;
    esac

    echo "${lag}"
    return 0
}

check_promotion_safety() {
    # Check if it's safe to promote this standby node
    # Returns: 0 = safe to promote
    #          1 = not safe (not caught up)
    #          2 = forced (safety checks bypassed)
    #
    # Decision flow:
    #   - force_promotion=true bypasses everything (rc=2)
    #   - already promoted is always safe (rc=0)
    #   - single node + sync replication configured => refuse (likely double failure)
    #   - rep_mode=sync: require sync_state='sync' on the primary
    #   - rep_mode=async: require replay lag <= max_promotion_lag_bytes

    local force_promotion=${OCF_RESKEY_force_promotion:-false}

    if [ "$force_promotion" = "true" ]; then
        ocf_log warn "force_promotion=true - BYPASSING ALL SAFETY CHECKS (DANGEROUS!)"
        ocf_log warn "This may result in data loss if standby is not caught up"
        return 2  # Forced
    fi

    # If already primary, always safe
    if pgsql_is_promoted; then
        return 0
    fi

    # SAFETY CHECK v1.6.14: Single-node promotion guard
    # If we're alone in the cluster with sync replication configured,
    # this indicates a double-failure scenario where we should NOT promote
    if command -v crm_node >/dev/null 2>&1; then
        local node_count=$(crm_node -l 2>/dev/null | grep -c member)

        if [ "$node_count" -eq 1 ]; then
            # We're alone - check if sync replication is configured
            # Use postgres -C to query config (works with any config file structure)
            local sync_standby=$(get_pg_config_value "synchronous_standby_names")

            if [ -n "$sync_standby" ]; then
                # Sync replication configured but we're alone - likely double failure
                ocf_log err "⚠️  REFUSING PROMOTION: Single node with sync replication ⚠️"
                ocf_log err "synchronous_standby_names='${sync_standby}'"
                ocf_log err ""
                ocf_log err "This suggests a double-failure scenario where:"
                ocf_log err "  - Both nodes crashed"
                ocf_log err "  - This node (former standby) booted first"
                ocf_log err "  - Other node (former primary) may have newer data"
                ocf_log err ""
                ocf_log err "If this node was the primary when standby failed, pgtwin"
                ocf_log err "would have disabled sync replication. Since sync is still"
                ocf_log err "enabled, this was likely the STANDBY node."
                ocf_log err ""
                ocf_log err "REFUSING promotion to prevent data loss!"
                ocf_log err ""
                ocf_log err "Recovery options:"
                ocf_log err "  1. WAIT for other node (former primary) to join (RECOMMENDED)"
                ocf_log err "  2. Check timelines on both nodes to confirm which has latest data"
                ocf_log err "  3. Only if confirmed this node has latest data:"
                ocf_log err "     - Stop cluster: crm configure property maintenance-mode=true"
                ocf_log err "     - Remove: rm ${PGDATA}/standby.signal"
                ocf_log err "     - Disable sync and restart cluster"
                ocf_log err ""
                return 1
            fi
        fi
    fi

    # Check if we're in sync mode
    if [ "${OCF_RESKEY_rep_mode}" = "sync" ]; then
        ocf_log info "Promotion safety check: rep_mode=sync - checking synchronous replication state"

        # Discover current primary to query sync state
        local primary_host=$(discover_promoted_node)
        if [ -z "$primary_host" ]; then
            ocf_log warn "Cannot discover primary node to check sync state"
            ocf_log warn "Allowing promotion (fail-open for cluster recovery scenarios)"
            return 0
        fi

        # Get our application_name
        local app_name
        get_application_name
        app_name="$APPLICATION_NAME"
        ocf_log info "Checking if application_name='${app_name}' is synchronized on primary ${primary_host}"

        # Query primary: Is this standby in sync state?
        local sync_state=$(run_as_pguser sh -c "${PSQL} -h ${primary_host} -p ${OCF_RESKEY_pgport} -Atc \"
            SELECT sync_state
            FROM pg_stat_replication
            WHERE application_name='${app_name}'
                AND state='streaming'
        \"" 2>/dev/null)

        if [ "$sync_state" = "sync" ]; then
            ocf_log info "✅ Promotion safety check PASSED: Node has sync_state='sync' (PostgreSQL guarantees caught up)"
            return 0
        elif [ "$sync_state" = "potential" ]; then
            ocf_log warn "❌ Promotion safety check FAILED: Node has sync_state='potential' (still catching up to sync)"
            ocf_log warn "Standby is connected and streaming but not yet fully synchronized"
            ocf_log warn "Blocking promotion - will retry on next monitor cycle"
            ocf_log warn "To force promotion anyway (DANGEROUS), set force_promotion=true"
            return 1
        elif [ "$sync_state" = "async" ]; then
            ocf_log warn "❌ Promotion safety check FAILED: Node has sync_state='async' (not in synchronous replication)"
            ocf_log warn "This can happen during startup while standby is catching up"
            ocf_log warn "Blocking promotion until synchronous replication is established"
            return 1
        elif [ -z "$sync_state" ]; then
            ocf_log warn "❌ Promotion safety check FAILED: Node not found in pg_stat_replication on primary"
            ocf_log warn "Standby may not be connected or replication may not be active"
            ocf_log warn "Blocking promotion - check replication connectivity"
            return 1
        else
            ocf_log warn "❌ Promotion safety check FAILED: Unknown sync_state='${sync_state}'"
            return 1
        fi
    else
        # Async mode - check lag
        ocf_log info "Promotion safety check: rep_mode=async - checking replication lag"

        local max_lag=${OCF_RESKEY_max_promotion_lag_bytes:-10485760}  # 10MB default

        if [ "$max_lag" -eq -1 ]; then
            ocf_log info "max_promotion_lag_bytes=-1 - skipping lag check (unsafe but requested)"
            return 0
        fi

        # BUG FIX: 'local lag_bytes=$(check_replication_lag)' masked the
        # command's exit status ('local' itself always returns 0, SC2155),
        # so lag_check_rc was always 0 and the fail-open branch below was
        # dead code. Split declaration and assignment to capture the status.
        local lag_bytes
        lag_bytes=$(check_replication_lag)
        local lag_check_rc=$?

        if [ $lag_check_rc -ne 0 ]; then
            ocf_log warn "Could not determine replication lag - allowing promotion (fail-open)"
            return 0
        fi

        if [ "$lag_bytes" -le "$max_lag" ]; then
            ocf_log info "✅ Promotion safety check PASSED: Replication lag ${lag_bytes} bytes <= threshold ${max_lag} bytes"
            return 0
        else
            ocf_log warn "❌ Promotion safety check FAILED: Replication lag ${lag_bytes} bytes > threshold ${max_lag} bytes"
            ocf_log warn "Standby is not caught up yet - blocking promotion"
            ocf_log warn "Current lag: $(numfmt --to=iec ${lag_bytes} 2>/dev/null || echo ${lag_bytes}) bytes"
            ocf_log warn "Threshold: $(numfmt --to=iec ${max_lag} 2>/dev/null || echo ${max_lag}) bytes"
            ocf_log warn "Wait for standby to catch up or increase max_promotion_lag_bytes"
            ocf_log warn "To force promotion anyway (DANGEROUS), set force_promotion=true"
            return 1
        fi
    fi
}

pgsql_promote() {
    # Promote this running standby to primary.
    # Preconditions: PostgreSQL must be running and not already promoted.
    # Runs the promotion safety gate first, then issues 'pg_ctl promote' and
    # polls up to 30 seconds for the server to report itself as primary.
    local rc

    if pgsql_is_promoted; then
        ocf_log info "PostgreSQL is already promoted"
        return $OCF_SUCCESS
    fi

    if ! pgsql_is_running; then
        ocf_log err "PostgreSQL is not running, cannot promote"
        return $OCF_ERR_GENERIC
    fi

    # NEW v1.6.8: Check promotion safety before allowing promotion
    # rc meanings: 0 = safe, 1 = blocked, 2 = forced (checks bypassed)
    local safety_rc
    check_promotion_safety
    safety_rc=$?

    case $safety_rc in
        1)
            ocf_log err "Promotion safety check failed - node is not ready for promotion"
            ocf_log err "Standby is still catching up or not in sync state"
            ocf_log err "Pacemaker will retry promotion on next monitor cycle once node is caught up"
            ocf_log err "To force immediate promotion (DANGEROUS), set force_promotion=true"
            return $OCF_ERR_GENERIC
            ;;
        2)
            ocf_log warn "⚠️  FORCED PROMOTION - Safety checks bypassed! This may cause data loss!"
            ;;
    esac

    ocf_log info "Promoting PostgreSQL to primary"

    # Promote using pg_ctl (it will remove standby.signal automatically)
    # Note: Do NOT manually remove standby.signal - pg_ctl promote handles it
    run_as_pguser sh -c "${PGCTL} -D ${PGDATA} promote" >/dev/null 2>&1
    rc=$?

    if [ $rc -ne 0 ]; then
        ocf_log err "Failed to promote PostgreSQL (rc=$rc)"
        # Check if standby.signal still exists (shouldn't happen but handle edge case)
        if [ -f "${PGDATA}/standby.signal" ]; then
            ocf_log info "standby.signal still exists, removing it manually"
            rm -f "${PGDATA}/standby.signal"
        fi
        return $OCF_ERR_GENERIC
    fi

    # Poll until the server reports itself promoted (30s budget)
    local waited=0
    while [ $waited -lt 30 ]; do
        if pgsql_is_promoted; then
            ocf_log info "PostgreSQL promoted successfully"

            # v1.7.0: Update application_name and synchronous_standby_names using ALTER SYSTEM
            update_application_name_in_config

            # Create replication slot if needed
            if [ -n "${OCF_RESKEY_slot_name}" ]; then
                create_replication_slot
            fi

            return $OCF_SUCCESS
        fi
        sleep 1
        waited=$((waited + 1))
    done

    ocf_log err "PostgreSQL promotion timed out"
    return $OCF_ERR_GENERIC
}

check_timeline_divergence() {
    # Check if local PGDATA has diverged from the primary
    # Returns 0 if diverged (needs recovery), 1 if not diverged
    #
    # Divergence means either (a) the local timeline differs from the
    # primary's, or (b) the local checkpoint LSN is ahead of the primary's
    # current WAL position. Both require pg_rewind (or basebackup) recovery.
    # All "cannot tell" cases fail-open as "not diverged" (return 1) and let
    # PostgreSQL itself detect problems on startup.
    local primary_host="$1"
    local rep_user=$(get_replication_user)

    if [ ! -d "${PGDATA}" ]; then
        ocf_log debug "PGDATA does not exist, no divergence possible"
        return 1
    fi

    # Get local checkpoint LSN and timeline from pg_controldata
    local local_checkpoint_lsn=""
    local local_timeline=""

    if command -v pg_controldata >/dev/null 2>&1; then
        # Run pg_controldata once and parse both fields from the cached output
        # (previously it was invoked twice, doubling the cost for no benefit)
        local controldata_output
        controldata_output=$(run_as_pguser pg_controldata "${PGDATA}" 2>/dev/null)
        local_checkpoint_lsn=$(echo "$controldata_output" | grep "Latest checkpoint location:" | awk '{print $NF}')
        local_timeline=$(echo "$controldata_output" | grep "Latest checkpoint's TimeLineID:" | awk '{print $NF}')
    fi

    if [ -z "$local_checkpoint_lsn" ] || [ -z "$local_timeline" ]; then
        ocf_log debug "Could not read local pg_controldata, assuming no divergence"
        return 1
    fi

    ocf_log info "Local PGDATA state: timeline=${local_timeline}, checkpoint_lsn=${local_checkpoint_lsn}"

    # Try to get primary's current WAL position
    local primary_current_lsn=$(run_as_pguser sh -c "${PSQL} -h ${primary_host} -p ${OCF_RESKEY_pgport} -U ${rep_user} -d postgres -Atc \"SELECT pg_current_wal_lsn()\"" 2>/dev/null)
    local primary_timeline=$(run_as_pguser sh -c "${PSQL} -h ${primary_host} -p ${OCF_RESKEY_pgport} -U ${rep_user} -d postgres -Atc \"SELECT timeline_id FROM pg_control_checkpoint()\"" 2>/dev/null)

    if [ -z "$primary_current_lsn" ] || [ -z "$primary_timeline" ]; then
        ocf_log warn "Could not query primary for timeline/LSN, will attempt demotion and let PostgreSQL detect issues"
        return 1
    fi

    ocf_log info "Primary state: timeline=${primary_timeline}, current_lsn=${primary_current_lsn}"

    # Check if timelines match
    if [ "$local_timeline" != "$primary_timeline" ]; then
        ocf_log warn "Timeline divergence detected: local=${local_timeline}, primary=${primary_timeline}"
        ocf_log info "Timeline divergence requires pg_rewind recovery"
        return 0
    fi

    # Check if local checkpoint is ahead of primary's current position
    # This indicates the local node has WAL that the primary doesn't have
    local lsn_compare=$(run_as_pguser sh -c "${PSQL} -h ${primary_host} -p ${OCF_RESKEY_pgport} -U ${rep_user} -d postgres -Atc \"SELECT pg_wal_lsn_diff('${local_checkpoint_lsn}', '${primary_current_lsn}')\"" 2>/dev/null)

    if [ -n "$lsn_compare" ] && [ "$lsn_compare" -gt 0 ]; then
        ocf_log warn "WAL divergence detected: local checkpoint ${local_checkpoint_lsn} is ahead of primary ${primary_current_lsn} by $lsn_compare bytes"
        ocf_log info "WAL divergence requires pg_rewind recovery"
        return 0
    fi

    ocf_log debug "No timeline divergence detected"
    return 1
}

pgsql_demote() {
    # Demote this primary to a standby of the currently promoted node.
    # Flow: discover the primary (dynamic discovery, then .pgpass, then
    # node_list), stop PostgreSQL, check for timeline divergence (which
    # routes through pg_rewind recovery), otherwise write standby config
    # and restart as a standby.
    local rc
    local primary_host=""
    local rep_user
    local app_name

    rep_user=$(get_replication_user)

    # get_application_name communicates its result via the APPLICATION_NAME global
    get_application_name
    app_name="$APPLICATION_NAME"

    if ! pgsql_is_promoted; then
        ocf_log info "PostgreSQL is already demoted"
        return $OCF_SUCCESS
    fi

    ocf_log info "Demoting PostgreSQL to standby"

    # Enhanced promoted node discovery (v1.6)
    # Try dynamic discovery first, then fall back to traditional methods
    ocf_log info "Discovering current promoted node for replication setup"
    primary_host=$(discover_promoted_node)

    if [ -z "$primary_host" ]; then
        # Fallback to .pgpass or node list
        ocf_log info "Dynamic discovery failed, trying traditional methods"
        primary_host=$(get_replication_host)

        if [ -z "$primary_host" ]; then
            # Fallback to node list (first non-self node)
            for node in ${OCF_RESKEY_node_list}; do
                if [ "$node" != "$(get_cluster_node_name)" ]; then
                    primary_host="$node"
                    break
                fi
            done
        fi
    fi

    if [ -z "$primary_host" ]; then
        ocf_log err "Cannot determine primary host for replication"
        ocf_log err "Tried: discover_promoted_node, .pgpass, and node_list"
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "Will demote to standby replicating from ${primary_host} as user '${rep_user}'"

    # Stop PostgreSQL
    pgsql_stop
    rc=$?
    if [ $rc -ne 0 ]; then
        return $rc
    fi

    # v1.7.0: Check for timeline divergence before attempting to start as standby
    ocf_log info "Checking for timeline divergence before demotion"
    if check_timeline_divergence "${primary_host}"; then
        ocf_log info "Timeline divergence detected, triggering pg_rewind recovery"

        # Call recover_standby which will handle pg_rewind, create standby.signal, and start
        recover_standby "${primary_host}"
        rc=$?

        if [ $rc -eq 0 ]; then
            ocf_log info "PostgreSQL demoted successfully via pg_rewind recovery"
            return $OCF_SUCCESS
        else
            ocf_log err "Failed to recover standby during demotion (rc=$rc)"
            return $rc
        fi
    fi

    # No divergence detected, proceed with normal demotion
    ocf_log info "No timeline divergence detected, proceeding with normal demotion"

    # Create standby.signal
    run_as_pguser touch "${PGDATA}/standby.signal"

    # Update postgresql.auto.conf for replication with validated application name
    run_as_pguser sh -c "cat > ${PGDATA}/postgresql.auto.conf" <<EOF
primary_conninfo = 'host=${primary_host} port=${OCF_RESKEY_pgport} user=${rep_user} application_name=${app_name} sslmode=prefer passfile=${OCF_RESKEY_pgpassfile}'
primary_slot_name = '${OCF_RESKEY_slot_name}'
EOF

    # BUG FIX: restrict permissions on the rewritten config. The file names
    # the passfile location, and finalize_standby_config chmods this same
    # file to 600 - demotion previously left it with default permissions.
    chmod 600 "${PGDATA}/postgresql.auto.conf"

    # Start as standby
    pgsql_start
    rc=$?

    if [ $rc -ne 0 ]; then
        ocf_log err "Failed to start PostgreSQL as standby after demotion"
        return $rc
    fi

    ocf_log info "PostgreSQL demoted successfully"
    return $OCF_SUCCESS
}

create_replication_slot() {
    # Ensure the physical replication slot named by OCF_RESKEY_slot_name
    # exists on the local server. Idempotent: returns 0 if the slot already
    # exists or was created; returns 1 if creation failed.
    ocf_log info "Creating replication slot ${OCF_RESKEY_slot_name}"

    # Probe for an existing slot before attempting creation
    local existing
    existing=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SELECT count(*) FROM pg_replication_slots WHERE slot_name='${OCF_RESKEY_slot_name}'\"" 2>/dev/null)

    if [ "$existing" = "1" ]; then
        ocf_log info "Replication slot ${OCF_RESKEY_slot_name} already exists"
        return 0
    fi

    # Slot is absent - create it now
    if run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \"SELECT pg_create_physical_replication_slot('${OCF_RESKEY_slot_name}')\"" >/dev/null 2>&1; then
        ocf_log info "Replication slot ${OCF_RESKEY_slot_name} created successfully"
        return 0
    fi

    ocf_log warn "Failed to create replication slot ${OCF_RESKEY_slot_name}"
    return 1
}

# Finalize standby configuration after pg_rewind or pg_basebackup
# This ensures postgresql.auto.conf has correct primary_conninfo with:
# - Correct primary host
# - Correct replication user
# - Correct application_name (pgtwin-managed)
# - Correct passfile location
# This runs AFTER pg_basebackup -R to sanitize/override the generated config
finalize_standby_config() {
    # Write/repair standby replication configuration for this node.
    # Arguments: $1 - primary host, $2 - replication user (both required).
    # If PostgreSQL is running, uses ALTER SYSTEM + pg_reload_conf();
    # otherwise (preferred) rewrites postgresql.auto.conf directly, which
    # also sanitizes whatever pg_basebackup -R generated.
    # Always ensures standby.signal exists. Returns 0 on success, 1 on error.
    local primary_host="$1"
    local rep_user="$2"
    local app_name
    local pgpass_file

    # Validate required parameters first, before doing any work
    if [ -z "$primary_host" ] || [ -z "$rep_user" ]; then
        ocf_log err "finalize_standby_config: Missing required parameters (host='${primary_host}', user='${rep_user}')"
        return 1
    fi

    # get_application_name communicates its result via the APPLICATION_NAME global
    get_application_name
    app_name="$APPLICATION_NAME"

    # BUG FIX: the ensure_pgpass failure check previously tested $? several
    # statements after the call (it actually saw the status of the preceding
    # 'if' statement, which is always 0 when the guard doesn't fire), so a
    # pgpass failure was never detected. Check the return value directly.
    if ! ensure_pgpass; then
        ocf_log err "finalize_standby_config: Cannot ensure .pgpass file"
        return 1
    fi
    pgpass_file="$PGPASS_FILE"

    ocf_log info "Finalizing standby configuration: primary=${primary_host}, user=${rep_user}, app_name=${app_name}"

    # Check if PostgreSQL is running
    if pgsql_is_running; then
        ocf_log warn "PostgreSQL is running during finalization - using ALTER SYSTEM (prefer to avoid this)"

        # Use ALTER SYSTEM if PostgreSQL is already running
        run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \"ALTER SYSTEM SET primary_conninfo = 'host=${primary_host} port=${OCF_RESKEY_pgport} user=${rep_user} application_name=${app_name} sslmode=prefer passfile=${pgpass_file}'\"" >/dev/null 2>&1

        if [ $? -ne 0 ]; then
            ocf_log err "Failed to set primary_conninfo via ALTER SYSTEM"
            return 1
        fi

        run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \"ALTER SYSTEM SET primary_slot_name = '${OCF_RESKEY_slot_name}'\"" >/dev/null 2>&1

        if [ $? -ne 0 ]; then
            ocf_log err "Failed to set primary_slot_name via ALTER SYSTEM"
            return 1
        fi

        # Reload configuration
        run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c 'SELECT pg_reload_conf()'" >/dev/null 2>&1

        ocf_log info "Standby configuration updated via ALTER SYSTEM (requires PostgreSQL restart to take effect)"
    else
        ocf_log info "PostgreSQL is stopped - directly updating postgresql.auto.conf (preferred method)"

        # Preferred method: Direct file update when PostgreSQL is stopped
        # This overwrites pg_basebackup -R generated config to ensure correctness
        run_as_pguser sh -c "cat > ${PGDATA}/postgresql.auto.conf" <<EOF
# Do not edit this file manually!
# It will be overwritten by the ALTER SYSTEM command.
primary_conninfo = 'host=${primary_host} port=${OCF_RESKEY_pgport} user=${rep_user} application_name=${app_name} sslmode=prefer passfile=${pgpass_file}'
primary_slot_name = '${OCF_RESKEY_slot_name}'
EOF

        if [ $? -ne 0 ]; then
            ocf_log err "Failed to write postgresql.auto.conf"
            return 1
        fi

        # Restrict permissions: the file references the passfile location
        chmod 600 "${PGDATA}/postgresql.auto.conf"

        ocf_log info "postgresql.auto.conf updated successfully"
    fi

    # Always ensure standby.signal exists
    # pg_basebackup -R should create this, but be defensive
    if [ ! -f "${PGDATA}/standby.signal" ]; then
        ocf_log info "Creating missing standby.signal"
        run_as_pguser touch "${PGDATA}/standby.signal"
    fi

    ocf_log info "Standby configuration finalized successfully"
    return 0
}

recover_standby() {
    # Re-synchronize this node as a standby of the given primary.
    # Strategy: stop PostgreSQL, try pg_rewind first (fast, incremental);
    # if pg_rewind is unavailable or fails, fall back to a full asynchronous
    # pg_basebackup (honoring backup_before_basebackup for the old PGDATA).
    # Arguments: $1 - primary host to recover from.
    # Returns 0 on success, or the status of the async basebackup path.
    local primary_host="$1"
    local rc
    local rep_user
    local app_name

    rep_user=$(get_replication_user)

    # get_application_name communicates its result via the APPLICATION_NAME global
    get_application_name
    app_name="$APPLICATION_NAME"

    ocf_log info "Attempting to recover standby from primary ${primary_host} using replication user '${rep_user}'"

    # Stop PostgreSQL if running
    if pgsql_is_running; then
        pgsql_stop
    fi

    # Try pg_rewind first
    if [ -x "${PG_REWIND}" ]; then
        ocf_log info "Attempting pg_rewind"

        # Ensure .pgpass is in standard location
        ensure_pgpass
        if [ $? -ne 0 ]; then
            ocf_log warn "Cannot ensure .pgpass file, skipping pg_rewind"
        else
            local pgpass_file="$PGPASS_FILE"
            ocf_log info "Using .pgpass file: ${pgpass_file}"

            # Test connectivity before attempting pg_rewind
            ocf_log debug "Testing connectivity to ${primary_host} as ${rep_user}"
            run_as_pguser env PGPASSFILE="${pgpass_file}" psql -h "${primary_host}" -p "${OCF_RESKEY_pgport}" -U "${rep_user}" -d postgres -Atc "SELECT 1" >/dev/null 2>&1
            if [ $? -ne 0 ]; then
                ocf_log warn "Cannot connect to primary ${primary_host} as ${rep_user}, pg_rewind will likely fail"
            fi

            # BUG FIX: use mktemp instead of the predictable /tmp/pg_rewind_$$.log
            # (predictable names in a shared /tmp are a symlink-attack risk);
            # fall back to the old name only if mktemp itself fails
            local rewind_log
            rewind_log=$(mktemp /tmp/pg_rewind.XXXXXX) || rewind_log="/tmp/pg_rewind_$$.log"

            # Run pg_rewind with proper environment variable passing
            ocf_log info "Executing pg_rewind to sync from ${primary_host}"
            run_as_pguser env PGPASSFILE="${pgpass_file}" \
                ${PG_REWIND} \
                --target-pgdata="${PGDATA}" \
                --source-server="host=${primary_host} port=${OCF_RESKEY_pgport} user=${rep_user} dbname=postgres sslmode=prefer" \
                --progress 2>&1 | tee "${rewind_log}"
            rc=${PIPESTATUS[0]}

            if [ $rc -eq 0 ]; then
                ocf_log info "pg_rewind completed successfully"
                rm -f "${rewind_log}"

                # Finalize standby configuration (creates standby.signal and postgresql.auto.conf)
                finalize_standby_config "${primary_host}" "${rep_user}"
                if [ $? -ne 0 ]; then
                    ocf_log err "Failed to finalize standby configuration after pg_rewind"
                    return $OCF_ERR_GENERIC
                fi

                return 0
            else
                # Keep the log file around for post-mortem inspection
                ocf_log warn "pg_rewind failed with exit code $rc, see log: ${rewind_log}"
                ocf_log warn "pg_rewind output: $(head -20 "${rewind_log}")"
                ocf_log warn "Falling back to full basebackup"
            fi
        fi
    fi

    # Check if async basebackup is already in progress
    # BUG FIX v1.6.6: Check state directory, not PGDATA
    local state_dir=$(dirname "${PGDATA}")
    if [ -f "${state_dir}/.pgtwin_basebackup_in_progress" ]; then
        check_basebackup_progress
        return $?
    fi

    # Check disk space before proceeding
    if ! check_disk_space_for_basebackup "${primary_host}"; then
        ocf_log err "Aborting basebackup due to insufficient disk space"
        return $OCF_ERR_GENERIC
    fi

    # Fallback to pg_basebackup
    ocf_log info "Performing full basebackup with pg_basebackup (asynchronous)"

    # Handle existing data directory based on backup_before_basebackup setting
    if [ "${OCF_RESKEY_backup_before_basebackup}" = "true" ]; then
        # BACKUP MODE: Preserve existing data in timestamped backup directory
        local backup_dir="${PGDATA}.backup.$(date +%s)"
        ocf_log info "Backup mode ENABLED: Moving ${PGDATA} to ${backup_dir}"

        # Clean up any old .old directories first
        if [ -d "${PGDATA}.old" ]; then
            ocf_log info "Removing old temporary directory ${PGDATA}.old"
            rm -rf "${PGDATA}.old"
        fi

        if [ -d "${PGDATA}" ]; then
            mv "${PGDATA}" "${backup_dir}"
            ocf_log info "Existing data PRESERVED in ${backup_dir}"
        fi
    else
        # NO-BACKUP MODE: Delete existing data immediately (no .old directory)
        ocf_log warn "Backup mode DISABLED: Deleting existing data directory ${PGDATA} permanently"

        if [ -d "${PGDATA}" ]; then
            # Delete immediately without creating .old
            rm -rf "${PGDATA}"
            ocf_log warn "Existing data DELETED (no backup created)"
        fi

        # Also clean up any leftover .old directories
        if [ -d "${PGDATA}.old" ]; then
            ocf_log info "Removing leftover ${PGDATA}.old directory"
            rm -rf "${PGDATA}.old"
        fi
    fi

    # Create fresh PGDATA directory
    mkdir -p "${PGDATA}"
    chown ${OCF_RESKEY_pguser}:${OCF_RESKEY_pguser} "${PGDATA}"
    chmod 750 "${PGDATA}"
    ocf_log info "Created fresh PGDATA directory at ${PGDATA} with permissions 750"

    # Start asynchronous pg_basebackup
    start_async_basebackup "${primary_host}" "${rep_user}"
    return $?
}

# Start pg_basebackup in background
#
# Re-initializes this node as a standby by streaming a full base backup
# from the primary.  The copy runs in a detached (nohup) background
# process so the time-limited Pacemaker start operation can return
# immediately; check_basebackup_progress() polls the state files on
# subsequent monitor cycles.
#
# Synchronous pre-flight steps before launching the copy:
#   1. ensure_pgpass   - credentials available at ~/.pgpass for psql and
#                        pg_basebackup (PGPASSFILE does not survive runuser)
#   2. slot creation   - idempotently create the physical replication slot
#                        on the primary so WAL is retained until the new
#                        standby connects (prevents WAL-recycling race)
#   3. version check   - refuse cross-major-version replication; best
#                        effort only (warns and proceeds if either side's
#                        version cannot be determined)
#
# Globals:   PGDATA, PSQL, PG_CTL, PG_BASEBACKUP, PGPASS_FILE,
#            OCF_RESKEY_pguser/pgport/slot_name/basebackup_timeout/node_list,
#            OCF_RESOURCE_INSTANCE
# Arguments: $1 - primary host to copy from
#            $2 - replication user
# Returns:   OCF_SUCCESS once the background copy is launched;
#            OCF_ERR_CONFIGURED or OCF_ERR_GENERIC on pre-flight failure
start_async_basebackup() {
    local primary_host="$1"
    local rep_user="$2"

    # BUG FIX v1.6.6: Store tracking files OUTSIDE of PGDATA
    # pg_basebackup requires PGDATA to be empty, so we can't put files there
    # Use parent directory (e.g., /var/lib/pgsql/) for state files
    local state_dir=$(dirname "${PGDATA}")
    local log_file="${state_dir}/.pgtwin_basebackup.log"
    local pid_file="${state_dir}/.pgtwin_basebackup_in_progress"
    local rc_file="${state_dir}/.pgtwin_basebackup_rc"

    ocf_log info "Starting asynchronous pg_basebackup from ${primary_host}"

    # Ensure .pgpass is in standard location
    ensure_pgpass
    if [ $? -ne 0 ]; then
        ocf_log err "Cannot ensure .pgpass file for pg_basebackup"
        return $OCF_ERR_CONFIGURED
    fi
    local pgpass_file="$PGPASS_FILE"
    ocf_log info "Using .pgpass file for pg_basebackup: ${pgpass_file}"

    # BUG FIX v1.6.10: Create replication slot BEFORE pg_basebackup
    # This prevents WAL recycling race condition between basebackup completion
    # and standby startup (can be minutes/hours with large DBs or slow startups)
    if [ -n "${OCF_RESKEY_slot_name}" ]; then
        ocf_log info "Ensuring replication slot '${OCF_RESKEY_slot_name}' exists on primary ${primary_host}"

        # Idempotent slot creation: creates only if doesn't exist, using single SQL query
        # Store SQL in variable and execute via psql, writing result to temp file to avoid command substitution
        # (second slot argument 'true' = immediately_reserve, so WAL retention starts now)
        local slot_sql="SELECT CASE WHEN (SELECT count(*) FROM pg_replication_slots WHERE slot_name = '${OCF_RESKEY_slot_name}') = 0 THEN (SELECT pg_create_physical_replication_slot('${OCF_RESKEY_slot_name}', true))::text ELSE 'already_exists' END"
        local slot_result_file="${state_dir}/.pgtwin_slot_result.$$"

        # Note: PGPASSFILE env var doesn't work with runuser, but psql automatically uses ~/.pgpass
        # ensure_pgpass() already ensures the file is at /var/lib/pgsql/.pgpass
        run_as_pguser ${PSQL} -h ${primary_host} -p ${OCF_RESKEY_pgport} -U ${rep_user} -d postgres -Atc "${slot_sql}" > "${slot_result_file}" 2>&1
        local slot_rc=$?
        local slot_result
        # read only picks up the first line of output; empty on read failure
        read slot_result < "${slot_result_file}" 2>/dev/null || slot_result=""
        rm -f "${slot_result_file}"

        if [ $slot_rc -eq 0 ]; then
            if [ "$slot_result" = "already_exists" ]; then
                ocf_log info "Replication slot '${OCF_RESKEY_slot_name}' already exists on primary (reusing it)"
            else
                ocf_log info "Replication slot '${OCF_RESKEY_slot_name}' created successfully on primary"
                ocf_log info "WAL is now protected from recycling during basebackup and standby startup"
            fi
        else
            # Abort: pg_basebackup -S with a missing slot would fail anyway
            ocf_log err "Failed to ensure replication slot exists: ${slot_result}"
            ocf_log err "pg_basebackup will likely fail due to missing slot"
            return $OCF_ERR_GENERIC
        fi
    fi

    # ENHANCEMENT v1.6.14: Verify PostgreSQL version compatibility before basebackup
    # This prevents cross-cluster replication with different major versions
    ocf_log info "Verifying PostgreSQL version compatibility between local and primary"

    # Get local PostgreSQL binary major version
    local local_version_output="${state_dir}/.pgtwin_local_version.$$"
    ${PG_CTL} --version > "${local_version_output}" 2>&1
    local local_version_full
    read local_version_full < "${local_version_output}" 2>/dev/null || local_version_full=""
    rm -f "${local_version_output}"

    # Extract major version (e.g., "pg_ctl (PostgreSQL) 18.1" -> "18")
    local local_pg_major=$(echo "$local_version_full" | grep -oE '[0-9]+\.[0-9]+' | head -1 | cut -d. -f1)

    if [ -z "$local_pg_major" ]; then
        # Best-effort check: cannot parse local version, so do not block
        ocf_log warn "Could not determine local PostgreSQL major version from: ${local_version_full}"
        ocf_log warn "Proceeding with pg_basebackup without version check"
    else
        ocf_log debug "Local PostgreSQL binary major version: ${local_pg_major}"

        # Get primary server version
        local primary_version_output="${state_dir}/.pgtwin_primary_version.$$"
        run_as_pguser ${PSQL} -h ${primary_host} -p ${OCF_RESKEY_pgport} -U ${rep_user} -d postgres -Atc "SHOW server_version" > "${primary_version_output}" 2>&1
        local primary_version_rc=$?
        local primary_version_full
        read primary_version_full < "${primary_version_output}" 2>/dev/null || primary_version_full=""
        rm -f "${primary_version_output}"

        if [ $primary_version_rc -ne 0 ]; then
            ocf_log warn "Could not query primary server version: ${primary_version_full}"
            ocf_log warn "Proceeding with pg_basebackup without version check"
        else
            # Extract major version (e.g., "18.1 on x86_64..." -> "18")
            local primary_pg_major=$(echo "$primary_version_full" | grep -oE '[0-9]+\.[0-9]+' | head -1 | cut -d. -f1)

            if [ -z "$primary_pg_major" ]; then
                ocf_log warn "Could not parse primary PostgreSQL major version from: ${primary_version_full}"
                ocf_log warn "Proceeding with pg_basebackup without version check"
            else
                ocf_log debug "Primary PostgreSQL server major version: ${primary_pg_major}"

                # Compare major versions
                if [ "$local_pg_major" != "$primary_pg_major" ]; then
                    ocf_log err "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓"
                    ocf_log err "┃ CRITICAL: PostgreSQL version mismatch detected!            ┃"
                    ocf_log err "┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛"
                    ocf_log err ""
                    ocf_log err "Local binary major version:  PostgreSQL ${local_pg_major}"
                    ocf_log err "Primary server major version: PostgreSQL ${primary_pg_major}"
                    ocf_log err "Primary host: ${primary_host}"
                    ocf_log err ""
                    ocf_log err "Cannot replicate between different PostgreSQL major versions."
                    ocf_log err "This likely indicates one of the following:"
                    ocf_log err "  1. Wrong cluster configuration (check node_list parameter)"
                    ocf_log err "  2. Wrong primary node discovered"
                    ocf_log err "  3. Cross-cluster replication attempt (multiple clusters on same nodes)"
                    ocf_log err ""
                    ocf_log err "Please verify:"
                    ocf_log err "  - node_list='${OCF_RESKEY_node_list}'"
                    ocf_log err "  - Primary host '${primary_host}' is in the correct cluster"
                    ocf_log err "  - /var/lib/pgsql/.pgpass contains entries only for this cluster's nodes"
                    ocf_log err ""
                    ocf_log err "pg_basebackup will NOT proceed."
                    return $OCF_ERR_CONFIGURED
                fi

                ocf_log info "✓ Version compatibility verified: PostgreSQL ${local_pg_major} (local) = PostgreSQL ${primary_pg_major} (primary)"
            fi
        fi
    fi

    # Create progress marker with correct ownership
    # (key=value lines; check_basebackup_progress() greps these back out)
    cat > "${pid_file}" <<EOF
started=$(date +%s)
primary=${primary_host}
user=${rep_user}
timeout=${OCF_RESKEY_basebackup_timeout}
pgdata=${PGDATA}
EOF
    # Get group name without command substitution
    # NOTE(review): /tmp/.pgtwin_pidfile_group.$$ is a predictable temp path;
    # mktemp would be safer against symlink races — confirm threat model.
    id -gn ${OCF_RESKEY_pguser} > /tmp/.pgtwin_pidfile_group.$$
    local pidfile_group
    read pidfile_group < /tmp/.pgtwin_pidfile_group.$$
    rm -f /tmp/.pgtwin_pidfile_group.$$
    chown ${OCF_RESKEY_pguser}:${pidfile_group} "${pid_file}"

    # BUG FIX v1.6.17: Pre-create log file with correct ownership
    # Without this, shell redirection creates file as root
    cat > "${log_file}" <<EOF
========================================
pg_basebackup started: $(date '+%Y-%m-%d %H:%M:%S')
Primary host: ${primary_host}
Replication user: ${rep_user}
Target PGDATA: ${PGDATA}
========================================

EOF
    chown ${OCF_RESKEY_pguser}:${pidfile_group} "${log_file}"

    # BUG FIX v1.6.17: Pre-create rc_file with correct ownership
    # The background process will overwrite this with the actual return code
    echo "" > "${rc_file}"
    chown ${OCF_RESKEY_pguser}:${pidfile_group} "${rc_file}"

    # Get current cluster node name for cleanup trigger
    local node_name=$(crm_node -n 2>/dev/null)
    if [ -z "$node_name" ]; then
        ocf_log warn "Could not determine cluster node name, cleanup trigger may not work"
        node_name=$(hostname)
    fi

    # Get UID/GID for background process (run_as_pguser function won't be available in subshell)
    local pguid=$(id -u "${OCF_RESKEY_pguser}")
    local pggid=$(id -g "${OCF_RESKEY_pguser}")

    # Detect if tracing is enabled in parent shell
    local trace_enabled="false"
    case "$-" in
        *x*) trace_enabled="true" ;;
    esac

    # Start pg_basebackup in background with nohup for SIGHUP protection
    # Note: PGPASSFILE env var doesn't work with runuser, but psql/pg_basebackup automatically use ~/.pgbass's standard lookup (~/.pgpass)
    # All output (including shell trace) goes to single log file
    # The child script drops privileges via setpriv (run_as_pguser is not
    # visible here), writes its exit code to rc_file, then triggers a
    # Pacemaker resource cleanup so restart need not wait for failure-timeout.
    # NOTE(review): '\$\$' in the DEBUG echo sits inside single quotes in the
    # child script, so it prints the literal text "$$" rather than the child
    # PID — confirm whether the PID was intended there.
    nohup sh -c "
        # Enable tracing if parent agent had it enabled
        if [ '${trace_enabled}' = 'true' ]; then
            set -x
            echo '=========================================='
            echo 'DEBUG: Background process tracing enabled'
            echo 'DEBUG: Process PID: \$\$'
            echo '=========================================='
        fi

        setpriv --reuid=${pguid} --regid=${pggid} --clear-groups env PGSSLMODE=prefer ${PG_BASEBACKUP} -h ${primary_host} -p ${OCF_RESKEY_pgport} -U ${rep_user} -D ${PGDATA} -X stream -P -R -S ${OCF_RESKEY_slot_name} --progress
        basebackup_rc=\$?
        echo \${basebackup_rc} > '${rc_file}'

        # NEW v1.6.8: Trigger automatic cleanup after basebackup completion
        # This eliminates the 5 minute failure-timeout wait for small databases
        # If cleanup fails, the existing failure-timeout mechanism still applies
        if [ \${basebackup_rc} -eq 0 ]; then
            echo \"\$(date '+%Y-%m-%d %H:%M:%S') Basebackup completed successfully, triggering resource cleanup\"
        else
            echo \"\$(date '+%Y-%m-%d %H:%M:%S') Basebackup failed with exit code \${basebackup_rc}, triggering resource cleanup\"
        fi

        # Trigger cleanup to allow immediate restart (instead of waiting 5 minutes)
        # Use full path to ensure command is found, and suppress errors if cleanup fails
        /usr/sbin/crm_resource --resource '${OCF_RESOURCE_INSTANCE}' --node '${node_name}' --cleanup
        cleanup_rc=\$?

        if [ \${cleanup_rc} -eq 0 ]; then
            echo \"\$(date '+%Y-%m-%d %H:%M:%S') Resource cleanup triggered successfully\"
        else
            echo \"\$(date '+%Y-%m-%d %H:%M:%S') Resource cleanup failed (rc=\${cleanup_rc}), relying on failure-timeout\"
        fi
    " >> "${log_file}" 2>&1 &

    # $! is the PID of the nohup'd sh, appended so the monitor can kill -0 it
    local bg_pid=$!
    echo "pid=${bg_pid}" >> "${pid_file}"

    ocf_log info "pg_basebackup started in background with PID ${bg_pid} (SIGHUP protected via nohup)"
    ocf_log info "Automatic cleanup will be triggered when basebackup completes (new in v1.6.8)"
    if [ "${trace_enabled}" = "true" ]; then
        ocf_log info "Background process tracing enabled - see ${log_file} for detailed execution"
    fi

    # Return success for now, monitor will check progress
    return $OCF_SUCCESS
}

# Check progress of asynchronous basebackup
#
# Polled while the background pg_basebackup launched by
# start_async_basebackup() may still be running.  Reads the tracking
# files in the PGDATA parent directory:
#   .pgtwin_basebackup_in_progress - key=value metadata plus background PID
#   .pgtwin_basebackup_rc          - exit code written by the background job
#   .pgtwin_basebackup.log         - pg_basebackup output (progress lines)
#
# Globals:   PGDATA, OCF_RESKEY_backup_before_basebackup
# Returns:   0               - no backup in progress, stale tracking files,
#                              or a completed backup was finalized
#            OCF_SUCCESS     - backup still running within its timeout
#            OCF_ERR_GENERIC - timeout, backup failure, or unexpected state
check_basebackup_progress() {
    # BUG FIX v1.6.6: Use same state directory as start_async_basebackup()
    local state_dir=$(dirname "${PGDATA}")
    local pid_file="${state_dir}/.pgtwin_basebackup_in_progress"
    local rc_file="${state_dir}/.pgtwin_basebackup_rc"
    local log_file="${state_dir}/.pgtwin_basebackup.log"

    if [ ! -f "${pid_file}" ]; then
        return 0  # No basebackup in progress
    fi

    # Read PID and start time
    local bg_pid=$(grep "^pid=" "${pid_file}" | cut -d= -f2)
    local started=$(grep "^started=" "${pid_file}" | cut -d= -f2)
    local timeout=$(grep "^timeout=" "${pid_file}" | cut -d= -f2)
    local elapsed=$(($(date +%s) - started))

    # Check if process is still running (kill -0 = existence probe only)
    if [ -n "$bg_pid" ] && kill -0 "$bg_pid" 2>/dev/null; then
        # Still running
        if [ "$elapsed" -gt "$timeout" ]; then
            ocf_log err "pg_basebackup timeout after ${elapsed}s (limit: ${timeout}s), killing process"
            kill -9 "$bg_pid" 2>/dev/null
            rm -f "${pid_file}" "${rc_file}"

            # Only try to restore if backup mode was enabled
            if [ "${OCF_RESKEY_backup_before_basebackup}" = "true" ]; then
                # Look for timestamped backup to restore (newest first by mtime)
                local latest_backup=$(ls -1td "${PGDATA}.backup."* 2>/dev/null | head -1)
                if [ -n "$latest_backup" ] && [ -d "$latest_backup" ]; then
                    ocf_log info "Restoring from backup: ${latest_backup}"
                    rm -rf "${PGDATA}"
                    mv "$latest_backup" "${PGDATA}"
                    ocf_log info "Data restored from backup after timeout"
                else
                    ocf_log err "No backup found to restore after timeout"
                fi
            else
                ocf_log warn "No backup mode - data lost after timeout (backup_before_basebackup=false)"
            fi

            return $OCF_ERR_GENERIC
        fi

        # Get progress from log if available (pg_basebackup --progress
        # emits "copied/total" byte counters)
        local progress_line=$(tail -1 "${log_file}" 2>/dev/null | grep -o '[0-9]\+/[0-9]\+' | head -1)
        if [ -n "$progress_line" ]; then
            ocf_log info "pg_basebackup in progress: ${progress_line} (elapsed: ${elapsed}s)"
        else
            ocf_log info "pg_basebackup in progress (elapsed: ${elapsed}s)"
        fi

        return $OCF_SUCCESS
    fi

    # Process completed, check result
    if [ -f "${rc_file}" ]; then
        # NOTE(review): rc_file is pre-created empty by start_async_basebackup;
        # if the job died before writing its code, bb_rc is empty and the
        # numeric test below errors out, falling into the failure branch —
        # confirm that is the intended handling.
        local bb_rc=$(cat "${rc_file}")

        # CRITICAL: Read values from pid_file BEFORE deleting it
        local primary_host=$(grep "^primary=" "${pid_file}" | cut -d= -f2)
        local rep_user=$(grep "^user=" "${pid_file}" | cut -d= -f2)

        # Now safe to delete
        rm -f "${pid_file}" "${rc_file}"

        if [ "$bb_rc" -eq 0 ]; then
            ocf_log info "Asynchronous pg_basebackup completed successfully after ${elapsed}s"

            # BUG FIX v1.6.6: Only finalize if PGDATA is actually valid
            # If PGDATA is empty/invalid, this means the tracking files are stale/orphaned
            # (e.g., user deleted PGDATA after basebackup completed)
            if ! is_valid_pgdata; then
                ocf_log warn "Basebackup tracking files exist but PGDATA is invalid - orphaned tracking files detected"
                ocf_log warn "This can happen if PGDATA was deleted after a successful basebackup"
                ocf_log info "Ignoring stale basebackup completion, will trigger new auto-initialization"
                # Don't finalize - let pgsql_start() detect empty PGDATA and run new basebackup
                return 0
            fi

            # PGDATA is valid, proceed with finalization
            # Clean up backup directories if backup mode was enabled
            if [ "${OCF_RESKEY_backup_before_basebackup}" = "true" ]; then
                # Optionally keep the most recent backup, remove older ones
                local backup_count=$(ls -1d "${PGDATA}.backup."* 2>/dev/null | wc -l)
                if [ "$backup_count" -gt 1 ]; then
                    ocf_log info "Cleaning up old backup directories (keeping most recent)"
                    ls -1td "${PGDATA}.backup."* | tail -n +2 | xargs rm -rf
                fi
            fi

            # Finalize standby configuration
            # This sanitizes pg_basebackup -R generated config with correct application_name and passfile
            finalize_standby_config "${primary_host}" "${rep_user}"
            if [ $? -ne 0 ]; then
                ocf_log err "Failed to finalize standby configuration after pg_basebackup"
                return $OCF_ERR_GENERIC
            fi

            return 0
        else
            ocf_log err "Asynchronous pg_basebackup failed with exit code ${bb_rc}"

            # Only try to restore if backup mode was enabled
            if [ "${OCF_RESKEY_backup_before_basebackup}" = "true" ]; then
                # Look for most recent backup to restore
                local latest_backup=$(ls -1td "${PGDATA}.backup."* 2>/dev/null | head -1)
                if [ -n "$latest_backup" ] && [ -d "$latest_backup" ]; then
                    ocf_log info "Restoring from backup after basebackup failure: ${latest_backup}"
                    rm -rf "${PGDATA}"
                    mv "$latest_backup" "${PGDATA}"
                    ocf_log info "Data restored from backup"
                else
                    ocf_log err "No backup found to restore after failure"
                fi
            else
                ocf_log warn "Basebackup failed and no backup mode - data may be lost (backup_before_basebackup=false)"
            fi

            return $OCF_ERR_GENERIC
        fi
    fi

    # Shouldn't get here (pid_file exists, process dead, but no rc_file),
    # cleanup just in case
    rm -f "${pid_file}"
    return $OCF_ERR_GENERIC
}

# Turn off synchronous replication on this (promoted) node.
# Clears synchronous_standby_names via ALTER SYSTEM and reloads the
# server configuration so commits no longer block waiting for a
# standby that is gone.
disable_sync_replication() {
    local sql_stmt

    ocf_log info "Disabling synchronous replication due to standby failure"

    # Apply the setting change, then reload; both best-effort and silent.
    for sql_stmt in \
        "ALTER SYSTEM SET synchronous_standby_names = ''" \
        "SELECT pg_reload_conf()"
    do
        run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \"${sql_stmt}\"" >/dev/null 2>&1
    done

    ocf_log info "Synchronous replication disabled"
}

# Turn synchronous replication back on for this (promoted) node.
# Sets synchronous_standby_names to '*' (any connected standby) via
# ALTER SYSTEM and reloads the server configuration.
enable_sync_replication() {
    local sql_stmt

    ocf_log info "Enabling synchronous replication (standby connected)"

    # Apply the setting change, then reload; both best-effort and silent.
    for sql_stmt in \
        "ALTER SYSTEM SET synchronous_standby_names = '*'" \
        "SELECT pg_reload_conf()"
    do
        run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -c \"${sql_stmt}\"" >/dev/null 2>&1
    done

    ocf_log info "Synchronous replication enabled"
}

# Handle Pacemaker clone notifications (notify action).
# Reacts to "<type>-<operation>" events: logs promote/demote phases and,
# in sync replication mode, toggles synchronous replication on the
# promoted node as standbys come and go.  Always returns OCF_SUCCESS.
pgsql_notify() {
    local notify_event="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}"

    case "$notify_event" in
        post-promote)
            # After promotion, update replication configuration
            ocf_log info "Post-promote notification received"
            ;;
        pre-demote)
            # Before demotion, prepare for standby role
            ocf_log info "Pre-demote notification received"
            ;;
        post-start)
            # A clone instance started somewhere; if we are the promoted
            # node in sync mode and an unpromoted (standby) instance is
            # now running, re-enable synchronous replication.
            if pgsql_is_promoted && [ "${OCF_RESKEY_rep_mode}" = "sync" ] \
                && [ -n "${OCF_RESKEY_CRM_meta_notify_unpromoted_resource}" ]; then
                ocf_log info "Post-start notification: Unpromoted resource started, enabling sync replication"
                enable_sync_replication
            fi
            ;;
        post-stop)
            # A node stopped; if we are promoted and no standby remains
            # connected, drop sync replication so commits don't hang.
            if pgsql_is_promoted; then
                local standby_count
                standby_count=$(run_as_pguser sh -c "${PSQL} -p ${OCF_RESKEY_pgport} -Atc \"SELECT count(*) FROM pg_stat_replication\"" 2>/dev/null)

                if [ "${OCF_RESKEY_rep_mode}" = "sync" ] && [ "$standby_count" = "0" ]; then
                    disable_sync_replication
                fi
            fi
            ;;
    esac

    return $OCF_SUCCESS
}

#######################################
# Validate resource configuration and environment (validate-all action).
# Deliberately lenient about PGDATA: a missing, empty, or partially
# initialized data directory is accepted because pgsql_start()
# auto-initializes standbys via pg_basebackup.
# Globals:   PGDATA, PG_CTL, PSQL, OCF_RESKEY_* parameters
# Returns:   OCF_SUCCESS on valid configuration;
#            OCF_ERR_INSTALLED for missing binaries/unwritable paths;
#            OCF_ERR_CONFIGURED for invalid parameter values
#######################################
pgsql_validate() {
    # Check binaries
    # BUG FIX: use ${PG_CTL} — the pg_ctl path variable used throughout
    # this agent (e.g. for version detection) — instead of ${PGCTL},
    # which is not set anywhere in this file and made this check a no-op
    # against an empty string.
    check_binary "${PG_CTL}"
    check_binary "${PSQL}"

    # Check data directory - be lenient, allow missing/empty for auto-initialization
    if [ ! -d "${PGDATA}" ]; then
        ocf_log info "PostgreSQL data directory ${PGDATA} does not exist - will be auto-initialized on start"
        # Try to create it to verify parent directory exists and is writable
        mkdir -p "${PGDATA}" 2>/dev/null || {
            ocf_log err "Cannot create PGDATA directory ${PGDATA} - check parent directory exists and permissions"
            return $OCF_ERR_INSTALLED
        }
        # Set ownership and permissions (non-fatal: start may still fix this)
        chown ${OCF_RESKEY_pguser}:${OCF_RESKEY_pguser} "${PGDATA}" 2>/dev/null && \
        chmod 750 "${PGDATA}" 2>/dev/null || {
            ocf_log warn "Cannot set ownership/permissions on ${PGDATA} - may need manual intervention"
        }
    elif [ -z "$(ls -A ${PGDATA} 2>/dev/null)" ]; then
        ocf_log info "PostgreSQL data directory ${PGDATA} is empty - will be auto-initialized on start"
    elif [ ! -f "${PGDATA}/PG_VERSION" ]; then
        ocf_log warn "PostgreSQL data directory ${PGDATA} exists but missing PG_VERSION - may be corrupted, will attempt auto-initialization"
    else
        # PGDATA exists with valid data - validate version compatibility
        if ! validate_pg_version; then
            ocf_log err "PostgreSQL version validation failed in validate phase"
            return $OCF_ERR_CONFIGURED
        fi
    fi

    # Validate application_name if provided (alphanumeric + underscore only)
    if ! validate_application_name "${OCF_RESKEY_application_name}"; then
        return $OCF_ERR_CONFIGURED
    fi

    # Validate backup_before_basebackup parameter (strict boolean)
    if [ "${OCF_RESKEY_backup_before_basebackup}" != "true" ] && [ "${OCF_RESKEY_backup_before_basebackup}" != "false" ]; then
        ocf_log err "Invalid backup_before_basebackup value: ${OCF_RESKEY_backup_before_basebackup}. Must be 'true' or 'false'"
        return $OCF_ERR_CONFIGURED
    fi

    # Validate basebackup_timeout (must be all digits)
    if ! echo "${OCF_RESKEY_basebackup_timeout}" | grep -qE '^[0-9]+$'; then
        ocf_log err "Invalid basebackup_timeout value: ${OCF_RESKEY_basebackup_timeout}. Must be a positive integer"
        return $OCF_ERR_CONFIGURED
    fi

    # Check .pgpass file if specified (warn only - ensure_pgpass may create it)
    if [ -n "${OCF_RESKEY_pgpassfile}" ] && [ ! -f "${OCF_RESKEY_pgpassfile}" ]; then
        ocf_log warn "Specified pgpassfile does not exist: ${OCF_RESKEY_pgpassfile}"
    fi

    # Validate container mode configuration (v1.6.5)
    if [ "${OCF_RESKEY_container_mode}" = "true" ] || [ "${OCF_RESKEY_container_mode}" = "yes" ]; then
        # Check if container runtime is available
        if ! command -v podman >/dev/null 2>&1 && ! command -v docker >/dev/null 2>&1; then
            ocf_log err "Container mode enabled but no container runtime found. Install podman or docker."
            return $OCF_ERR_INSTALLED
        fi

        # Validate pg_major_version format if specified
        if [ -n "${OCF_RESKEY_pg_major_version}" ] && [ "${OCF_RESKEY_pg_major_version}" != "auto" ]; then
            if ! echo "${OCF_RESKEY_pg_major_version}" | grep -qE '^[0-9]+$'; then
                ocf_log err "Invalid pg_major_version: ${OCF_RESKEY_pg_major_version}. Must be a number (e.g., 17, 16, 15)"
                return $OCF_ERR_CONFIGURED
            fi
        fi

        # Validate container_name is not empty
        if [ -z "${OCF_RESKEY_container_name}" ]; then
            ocf_log err "Container mode enabled but container_name is empty"
            return $OCF_ERR_CONFIGURED
        fi

        # Validate container_image format if specified
        if [ -n "${OCF_RESKEY_container_image}" ] && [ "${OCF_RESKEY_container_image}" != "auto" ]; then
            # Basic validation: should contain at least a colon for tag
            if ! echo "${OCF_RESKEY_container_image}" | grep -q ':'; then
                ocf_log warn "Container image '${OCF_RESKEY_container_image}' does not specify a tag. Consider using versioned tags."
            fi
        fi

        # If both pg_major_version and container_image are set, warn about priority
        if [ -n "${OCF_RESKEY_pg_major_version}" ] && [ "${OCF_RESKEY_pg_major_version}" != "auto" ] && \
           [ -n "${OCF_RESKEY_container_image}" ] && [ "${OCF_RESKEY_container_image}" != "auto" ]; then
            ocf_log info "Both pg_major_version and container_image are set. Using container_image: ${OCF_RESKEY_container_image}"
        fi

        ocf_log info "Container mode validation passed. Runtime: $(command -v podman >/dev/null 2>&1 && echo podman || echo docker)"
    fi

    return $OCF_SUCCESS
}

#######################################################################
# Main

# Entry point: dispatch the action requested by Pacemaker
# ($__OCF_ACTION) to its handler.  meta-data, usage/help and unknown
# actions exit immediately; every other handler falls through so its
# return code is logged and then propagated as the script's exit status.
case "$__OCF_ACTION" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    start)
        pgsql_start
        ;;
    stop)
        pgsql_stop
        ;;
    monitor)
        pgsql_monitor
        ;;
    promote)
        pgsql_promote
        ;;
    demote)
        pgsql_demote
        ;;
    notify)
        pgsql_notify
        ;;
    validate-all)
        pgsql_validate
        ;;
    usage|help)
        pgsql_usage
        exit $OCF_SUCCESS
        ;;
    *)
        pgsql_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc