Commit 153db18a authored by nextime

Fix watchdog behavior to prevent unnecessary restarts

- Change watchdogs from continuous monitoring to one-time start
- Check START=yes in /etc/default/ before starting services
- Only start services if they're not already running
- Double-check that the process recorded in the PID file is actually running and is the expected daemon
- Improve status and stop functions with process verification
- Fix both wssshd and wssshc watchdogs
parent e348dd54
...@@ -34,12 +34,19 @@ log_message() { ...@@ -34,12 +34,19 @@ log_message() {
logger -t "$DAEMON_NAME-watchdog" "$*" logger -t "$DAEMON_NAME-watchdog" "$*"
} }
# Function to check if daemon is running # Function to check if daemon is running (double-check process)
is_daemon_running() { is_daemon_running() {
if [ -f "$PID_FILE" ]; then if [ -f "$PID_FILE" ]; then
local pid=$(cat "$PID_FILE") local pid=$(cat "$PID_FILE")
# First check if process exists
if kill -0 "$pid" 2>/dev/null; then if kill -0 "$pid" 2>/dev/null; then
return 0 # Running # Double-check: verify the process is actually our daemon
if ps -p "$pid" -o comm= 2>/dev/null | grep -q "^wssshd$"; then
return 0 # Running and correct process
else
log_message "PID file exists but process $pid is not wssshd"
rm -f "$PID_FILE"
fi
else else
log_message "PID file exists but process $pid is not running" log_message "PID file exists but process $pid is not running"
rm -f "$PID_FILE" rm -f "$PID_FILE"
...@@ -98,13 +105,27 @@ start_daemon() { ...@@ -98,13 +105,27 @@ start_daemon() {
fi fi
} }
# Function to stop daemon # Function to stop daemon (double-check process)
stop_daemon() { stop_daemon() {
log_message "Stopping $DAEMON_NAME daemon..." log_message "Stopping $DAEMON_NAME daemon..."
if [ -f "$PID_FILE" ]; then
local pid=$(cat "$PID_FILE")
# Double-check: verify the process is actually our daemon
if ps -p "$pid" -o comm= 2>/dev/null | grep -q "^wssshd$"; then
start-stop-daemon --stop --quiet --pidfile "$PID_FILE" --retry=TERM/30/KILL/5 start-stop-daemon --stop --quiet --pidfile "$PID_FILE" --retry=TERM/30/KILL/5
local result=$? local result=$?
rm -f "$PID_FILE" rm -f "$PID_FILE"
return $result return $result
else
log_message "PID file exists but process $pid is not wssshd"
rm -f "$PID_FILE"
return 1
fi
else
log_message "PID file not found, daemon may not be running"
return 1
fi
} }
# Function to check restart limits # Function to check restart limits
...@@ -140,7 +161,7 @@ cleanup() { ...@@ -140,7 +161,7 @@ cleanup() {
# Trap signals # Trap signals
trap cleanup SIGTERM SIGINT trap cleanup SIGTERM SIGINT
# Main watchdog loop # Main watchdog function (one-time start, not continuous monitoring)
main() { main() {
# Check if START is enabled (accept various forms: yes, YES, Y, 1, true, TRUE) # Check if START is enabled (accept various forms: yes, YES, Y, 1, true, TRUE)
START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]') START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]')
...@@ -152,32 +173,25 @@ main() { ...@@ -152,32 +173,25 @@ main() {
# Store watchdog PID # Store watchdog PID
echo $$ > "$WATCHDOG_PID_FILE" echo $$ > "$WATCHDOG_PID_FILE"
log_message "Watchdog started for $DAEMON_NAME" log_message "Watchdog starting $DAEMON_NAME"
log_message "Check interval: $CHECK_INTERVAL seconds"
log_message "Max restarts: $MAX_RESTARTS per $RESTART_WINDOW seconds"
while true; do
if ! is_daemon_running; then
log_message "$DAEMON_NAME is not running"
# Check restart limits # Check if daemon is already running
if ! check_restart_limits; then if is_daemon_running; then
log_message "Restart limits exceeded. Watchdog will not restart $DAEMON_NAME." log_message "$DAEMON_NAME is already running"
break cleanup
exit 0
fi fi
# Attempt to start daemon # Attempt to start daemon
if start_daemon; then if start_daemon; then
log_message "$DAEMON_NAME restarted successfully" log_message "$DAEMON_NAME started successfully"
else else
log_message "Failed to restart $DAEMON_NAME" log_message "Failed to start $DAEMON_NAME"
fi cleanup
exit 1
fi fi
sleep "$CHECK_INTERVAL" log_message "Watchdog exiting (one-time start completed)"
done
log_message "Watchdog exiting"
cleanup cleanup
} }
...@@ -204,19 +218,45 @@ case "$1" in ...@@ -204,19 +218,45 @@ case "$1" in
;; ;;
stop) stop)
if [ -f "$WATCHDOG_PID_FILE" ]; then if [ -f "$WATCHDOG_PID_FILE" ]; then
kill "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null local watchdog_pid=$(cat "$WATCHDOG_PID_FILE")
# Double-check: verify the process is actually our watchdog
if ps -p "$watchdog_pid" -o comm= 2>/dev/null | grep -q "wssshd-watchdog"; then
kill "$watchdog_pid" 2>/dev/null
rm -f "$WATCHDOG_PID_FILE" rm -f "$WATCHDOG_PID_FILE"
echo "Watchdog stopped" echo "Watchdog stopped"
else
echo "PID file exists but process is not wssshd-watchdog"
rm -f "$WATCHDOG_PID_FILE"
fi
else else
echo "Watchdog is not running" echo "Watchdog is not running"
fi fi
;; ;;
status) status)
if [ -f "$WATCHDOG_PID_FILE" ] && kill -0 "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null; then if [ -f "$WATCHDOG_PID_FILE" ]; then
echo "Watchdog is running (PID: $(cat "$WATCHDOG_PID_FILE"))" local watchdog_pid=$(cat "$WATCHDOG_PID_FILE")
if kill -0 "$watchdog_pid" 2>/dev/null; then
# Double-check: verify the process is actually our watchdog
if ps -p "$watchdog_pid" -o comm= 2>/dev/null | grep -q "wssshd-watchdog"; then
echo "Watchdog is running (PID: $watchdog_pid)"
else
echo "Watchdog PID file exists but process is not wssshd-watchdog"
rm -f "$WATCHDOG_PID_FILE"
fi
else
echo "Watchdog PID file exists but process is not running"
rm -f "$WATCHDOG_PID_FILE"
fi
else else
echo "Watchdog is not running" echo "Watchdog is not running"
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null fi
# Also check daemon status
if is_daemon_running; then
local daemon_pid=$(cat "$PID_FILE")
echo "Daemon is running (PID: $daemon_pid)"
else
echo "Daemon is not running"
fi fi
;; ;;
restart) restart)
......
...@@ -34,12 +34,19 @@ log_message() { ...@@ -34,12 +34,19 @@ log_message() {
logger -t "$DAEMON_NAME-watchdog" "$*" logger -t "$DAEMON_NAME-watchdog" "$*"
} }
# Function to check if daemon is running # Function to check if daemon is running (double-check process)
is_daemon_running() { is_daemon_running() {
if [ -f "$PID_FILE" ]; then if [ -f "$PID_FILE" ]; then
local pid=$(cat "$PID_FILE") local pid=$(cat "$PID_FILE")
# First check if process exists
if kill -0 "$pid" 2>/dev/null; then if kill -0 "$pid" 2>/dev/null; then
return 0 # Running # Double-check: verify the process is actually our daemon
if ps -p "$pid" -o comm= 2>/dev/null | grep -q "^wssshc$"; then
return 0 # Running and correct process
else
log_message "PID file exists but process $pid is not wssshc"
rm -f "$PID_FILE"
fi
else else
log_message "PID file exists but process $pid is not running" log_message "PID file exists but process $pid is not running"
rm -f "$PID_FILE" rm -f "$PID_FILE"
...@@ -98,13 +105,27 @@ start_daemon() { ...@@ -98,13 +105,27 @@ start_daemon() {
fi fi
} }
# Function to stop daemon # Function to stop daemon (double-check process)
stop_daemon() { stop_daemon() {
log_message "Stopping $DAEMON_NAME daemon..." log_message "Stopping $DAEMON_NAME daemon..."
if [ -f "$PID_FILE" ]; then
local pid=$(cat "$PID_FILE")
# Double-check: verify the process is actually our daemon
if ps -p "$pid" -o comm= 2>/dev/null | grep -q "^wssshc$"; then
start-stop-daemon --stop --quiet --pidfile "$PID_FILE" --retry=TERM/30/KILL/5 start-stop-daemon --stop --quiet --pidfile "$PID_FILE" --retry=TERM/30/KILL/5
local result=$? local result=$?
rm -f "$PID_FILE" rm -f "$PID_FILE"
return $result return $result
else
log_message "PID file exists but process $pid is not wssshc"
rm -f "$PID_FILE"
return 1
fi
else
log_message "PID file not found, daemon may not be running"
return 1
fi
} }
# Function to check restart limits # Function to check restart limits
...@@ -140,7 +161,7 @@ cleanup() { ...@@ -140,7 +161,7 @@ cleanup() {
# Trap signals # Trap signals
trap cleanup SIGTERM SIGINT trap cleanup SIGTERM SIGINT
# Main watchdog loop # Main watchdog function (one-time start, not continuous monitoring)
main() { main() {
# Check if START is enabled (accept various forms: yes, YES, Y, 1, true, TRUE) # Check if START is enabled (accept various forms: yes, YES, Y, 1, true, TRUE)
START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]') START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]')
...@@ -152,32 +173,25 @@ main() { ...@@ -152,32 +173,25 @@ main() {
# Store watchdog PID # Store watchdog PID
echo $$ > "$WATCHDOG_PID_FILE" echo $$ > "$WATCHDOG_PID_FILE"
log_message "Watchdog started for $DAEMON_NAME" log_message "Watchdog starting $DAEMON_NAME"
log_message "Check interval: $CHECK_INTERVAL seconds"
log_message "Max restarts: $MAX_RESTARTS per $RESTART_WINDOW seconds"
while true; do
if ! is_daemon_running; then
log_message "$DAEMON_NAME is not running"
# Check restart limits # Check if daemon is already running
if ! check_restart_limits; then if is_daemon_running; then
log_message "Restart limits exceeded. Watchdog will not restart $DAEMON_NAME." log_message "$DAEMON_NAME is already running"
break cleanup
exit 0
fi fi
# Attempt to start daemon # Attempt to start daemon
if start_daemon; then if start_daemon; then
log_message "$DAEMON_NAME restarted successfully" log_message "$DAEMON_NAME started successfully"
else else
log_message "Failed to restart $DAEMON_NAME" log_message "Failed to start $DAEMON_NAME"
fi cleanup
exit 1
fi fi
sleep "$CHECK_INTERVAL" log_message "Watchdog exiting (one-time start completed)"
done
log_message "Watchdog exiting"
cleanup cleanup
} }
...@@ -204,19 +218,45 @@ case "$1" in ...@@ -204,19 +218,45 @@ case "$1" in
;; ;;
stop) stop)
if [ -f "$WATCHDOG_PID_FILE" ]; then if [ -f "$WATCHDOG_PID_FILE" ]; then
kill "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null local watchdog_pid=$(cat "$WATCHDOG_PID_FILE")
# Double-check: verify the process is actually our watchdog
if ps -p "$watchdog_pid" -o comm= 2>/dev/null | grep -q "wssshc-watchdog"; then
kill "$watchdog_pid" 2>/dev/null
rm -f "$WATCHDOG_PID_FILE" rm -f "$WATCHDOG_PID_FILE"
echo "Watchdog stopped" echo "Watchdog stopped"
else
echo "PID file exists but process is not wssshc-watchdog"
rm -f "$WATCHDOG_PID_FILE"
fi
else else
echo "Watchdog is not running" echo "Watchdog is not running"
fi fi
;; ;;
status) status)
if [ -f "$WATCHDOG_PID_FILE" ] && kill -0 "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null; then if [ -f "$WATCHDOG_PID_FILE" ]; then
echo "Watchdog is running (PID: $(cat "$WATCHDOG_PID_FILE"))" local watchdog_pid=$(cat "$WATCHDOG_PID_FILE")
if kill -0 "$watchdog_pid" 2>/dev/null; then
# Double-check: verify the process is actually our watchdog
if ps -p "$watchdog_pid" -o comm= 2>/dev/null | grep -q "wssshc-watchdog"; then
echo "Watchdog is running (PID: $watchdog_pid)"
else
echo "Watchdog PID file exists but process is not wssshc-watchdog"
rm -f "$WATCHDOG_PID_FILE"
fi
else
echo "Watchdog PID file exists but process is not running"
rm -f "$WATCHDOG_PID_FILE"
fi
else else
echo "Watchdog is not running" echo "Watchdog is not running"
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null fi
# Also check daemon status
if is_daemon_running; then
local daemon_pid=$(cat "$PID_FILE")
echo "Daemon is running (PID: $daemon_pid)"
else
echo "Daemon is not running"
fi fi
;; ;;
restart) restart)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment