Commit 153db18a authored by nextime

Fix watchdog behavior to prevent unnecessary restarts

- Change watchdogs from continuous monitoring to one-time start
- Check START=yes in /etc/default/ before starting services
- Only start services if they're not already running
- Double-check processes are actually running and correct
- Improve status and stop functions with process verification
- Fix both wssshd and wssshc watchdogs
parent e348dd54
......@@ -34,12 +34,19 @@ log_message() {
logger -t "$DAEMON_NAME-watchdog" "$*"
}
# Function to check if daemon is running
# Function to check if daemon is running (double-check process)
is_daemon_running() {
if [ -f "$PID_FILE" ]; then
local pid=$(cat "$PID_FILE")
# First check if process exists
if kill -0 "$pid" 2>/dev/null; then
return 0 # Running
# Double-check: verify the process is actually our daemon
if ps -p "$pid" -o comm= 2>/dev/null | grep -q "^wssshd$"; then
return 0 # Running and correct process
else
log_message "PID file exists but process $pid is not wssshd"
rm -f "$PID_FILE"
fi
else
log_message "PID file exists but process $pid is not running"
rm -f "$PID_FILE"
......@@ -98,13 +105,27 @@ start_daemon() {
fi
}
# Function to stop daemon
# Function to stop daemon (double-check process)
# Stop the wssshd daemon recorded in $PID_FILE.
# Globals:   PID_FILE (read, may be removed), DAEMON_NAME (read, logging only)
# Arguments: none
# Returns:   1 if the PID file is missing or the recorded PID is not a
#            wssshd process; otherwise the exit status of start-stop-daemon.
stop_daemon() {
    log_message "Stopping $DAEMON_NAME daemon..."

    if [ ! -f "$PID_FILE" ]; then
        log_message "PID file not found, daemon may not be running"
        return 1
    fi

    # Declare and assign separately so a failing cat is not masked by 'local'.
    local pid
    pid=$(cat "$PID_FILE") || pid=""

    # Double-check: verify the process is actually our daemon before signalling
    # it; a stale PID file may point at an unrelated process (or at nothing).
    if ! ps -p "$pid" -o comm= 2>/dev/null | grep -q "^wssshd$"; then
        log_message "PID file exists but process $pid is not wssshd"
        rm -f "$PID_FILE"
        return 1
    fi

    local result=0
    start-stop-daemon --stop --quiet --pidfile "$PID_FILE" --retry=TERM/30/KILL/5 || result=$?

    # Only drop the PID file when the process is gone (0 = stopped,
    # 1 = nothing left to stop). On any other status the daemon may still be
    # alive, and deleting the PID file would leave it running but untracked.
    case "$result" in
        0|1)
            rm -f "$PID_FILE"
            ;;
        *)
            log_message "Failed to stop $DAEMON_NAME (start-stop-daemon exit $result), keeping PID file"
            ;;
    esac
    return "$result"
}
# Function to check restart limits
......@@ -140,7 +161,7 @@ cleanup() {
# Trap signals
trap cleanup SIGTERM SIGINT
# Main watchdog loop
# Main watchdog function (one-time start, not continuous monitoring)
main() {
# Check if START is enabled (accept various forms: yes, YES, Y, 1, true, TRUE)
START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]')
......@@ -152,32 +173,25 @@ main() {
# Store watchdog PID
echo $$ > "$WATCHDOG_PID_FILE"
log_message "Watchdog started for $DAEMON_NAME"
log_message "Check interval: $CHECK_INTERVAL seconds"
log_message "Max restarts: $MAX_RESTARTS per $RESTART_WINDOW seconds"
while true; do
if ! is_daemon_running; then
log_message "$DAEMON_NAME is not running"
log_message "Watchdog starting $DAEMON_NAME"
# Check restart limits
if ! check_restart_limits; then
log_message "Restart limits exceeded. Watchdog will not restart $DAEMON_NAME."
break
# Check if daemon is already running
if is_daemon_running; then
log_message "$DAEMON_NAME is already running"
cleanup
exit 0
fi
# Attempt to start daemon
if start_daemon; then
log_message "$DAEMON_NAME restarted successfully"
log_message "$DAEMON_NAME started successfully"
else
log_message "Failed to restart $DAEMON_NAME"
fi
log_message "Failed to start $DAEMON_NAME"
cleanup
exit 1
fi
sleep "$CHECK_INTERVAL"
done
log_message "Watchdog exiting"
log_message "Watchdog exiting (one-time start completed)"
cleanup
}
......@@ -204,19 +218,45 @@ case "$1" in
;;
stop)
if [ -f "$WATCHDOG_PID_FILE" ]; then
kill "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null
local watchdog_pid=$(cat "$WATCHDOG_PID_FILE")
# Double-check: verify the process is actually our watchdog
if ps -p "$watchdog_pid" -o comm= 2>/dev/null | grep -q "wssshd-watchdog"; then
kill "$watchdog_pid" 2>/dev/null
rm -f "$WATCHDOG_PID_FILE"
echo "Watchdog stopped"
else
echo "PID file exists but process is not wssshd-watchdog"
rm -f "$WATCHDOG_PID_FILE"
fi
else
echo "Watchdog is not running"
fi
;;
status)
if [ -f "$WATCHDOG_PID_FILE" ] && kill -0 "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null; then
echo "Watchdog is running (PID: $(cat "$WATCHDOG_PID_FILE"))"
if [ -f "$WATCHDOG_PID_FILE" ]; then
local watchdog_pid=$(cat "$WATCHDOG_PID_FILE")
if kill -0 "$watchdog_pid" 2>/dev/null; then
# Double-check: verify the process is actually our watchdog
if ps -p "$watchdog_pid" -o comm= 2>/dev/null | grep -q "wssshd-watchdog"; then
echo "Watchdog is running (PID: $watchdog_pid)"
else
echo "Watchdog PID file exists but process is not wssshd-watchdog"
rm -f "$WATCHDOG_PID_FILE"
fi
else
echo "Watchdog PID file exists but process is not running"
rm -f "$WATCHDOG_PID_FILE"
fi
else
echo "Watchdog is not running"
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
fi
# Also check daemon status
if is_daemon_running; then
local daemon_pid=$(cat "$PID_FILE")
echo "Daemon is running (PID: $daemon_pid)"
else
echo "Daemon is not running"
fi
;;
restart)
......
......@@ -34,12 +34,19 @@ log_message() {
logger -t "$DAEMON_NAME-watchdog" "$*"
}
# Function to check if daemon is running
# Function to check if daemon is running (double-check process)
is_daemon_running() {
if [ -f "$PID_FILE" ]; then
local pid=$(cat "$PID_FILE")
# First check if process exists
if kill -0 "$pid" 2>/dev/null; then
return 0 # Running
# Double-check: verify the process is actually our daemon
if ps -p "$pid" -o comm= 2>/dev/null | grep -q "^wssshc$"; then
return 0 # Running and correct process
else
log_message "PID file exists but process $pid is not wssshc"
rm -f "$PID_FILE"
fi
else
log_message "PID file exists but process $pid is not running"
rm -f "$PID_FILE"
......@@ -98,13 +105,27 @@ start_daemon() {
fi
}
# Function to stop daemon
# Function to stop daemon (double-check process)
# Stop the wssshc daemon recorded in $PID_FILE.
# Globals:   PID_FILE (read, may be removed), DAEMON_NAME (read, logging only)
# Arguments: none
# Returns:   1 if the PID file is missing or the recorded PID is not a
#            wssshc process; otherwise the exit status of start-stop-daemon.
stop_daemon() {
    log_message "Stopping $DAEMON_NAME daemon..."

    if [ ! -f "$PID_FILE" ]; then
        log_message "PID file not found, daemon may not be running"
        return 1
    fi

    # Declare and assign separately so a failing cat is not masked by 'local'.
    local pid
    pid=$(cat "$PID_FILE") || pid=""

    # Double-check: verify the process is actually our daemon before signalling
    # it; a stale PID file may point at an unrelated process (or at nothing).
    if ! ps -p "$pid" -o comm= 2>/dev/null | grep -q "^wssshc$"; then
        log_message "PID file exists but process $pid is not wssshc"
        rm -f "$PID_FILE"
        return 1
    fi

    local result=0
    start-stop-daemon --stop --quiet --pidfile "$PID_FILE" --retry=TERM/30/KILL/5 || result=$?

    # Only drop the PID file when the process is gone (0 = stopped,
    # 1 = nothing left to stop). On any other status the daemon may still be
    # alive, and deleting the PID file would leave it running but untracked.
    case "$result" in
        0|1)
            rm -f "$PID_FILE"
            ;;
        *)
            log_message "Failed to stop $DAEMON_NAME (start-stop-daemon exit $result), keeping PID file"
            ;;
    esac
    return "$result"
}
# Function to check restart limits
......@@ -140,7 +161,7 @@ cleanup() {
# Trap signals
trap cleanup SIGTERM SIGINT
# Main watchdog loop
# Main watchdog function (one-time start, not continuous monitoring)
main() {
# Check if START is enabled (accept various forms: yes, YES, Y, 1, true, TRUE)
START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]')
......@@ -152,32 +173,25 @@ main() {
# Store watchdog PID
echo $$ > "$WATCHDOG_PID_FILE"
log_message "Watchdog started for $DAEMON_NAME"
log_message "Check interval: $CHECK_INTERVAL seconds"
log_message "Max restarts: $MAX_RESTARTS per $RESTART_WINDOW seconds"
while true; do
if ! is_daemon_running; then
log_message "$DAEMON_NAME is not running"
log_message "Watchdog starting $DAEMON_NAME"
# Check restart limits
if ! check_restart_limits; then
log_message "Restart limits exceeded. Watchdog will not restart $DAEMON_NAME."
break
# Check if daemon is already running
if is_daemon_running; then
log_message "$DAEMON_NAME is already running"
cleanup
exit 0
fi
# Attempt to start daemon
if start_daemon; then
log_message "$DAEMON_NAME restarted successfully"
log_message "$DAEMON_NAME started successfully"
else
log_message "Failed to restart $DAEMON_NAME"
fi
log_message "Failed to start $DAEMON_NAME"
cleanup
exit 1
fi
sleep "$CHECK_INTERVAL"
done
log_message "Watchdog exiting"
log_message "Watchdog exiting (one-time start completed)"
cleanup
}
......@@ -204,19 +218,45 @@ case "$1" in
;;
stop)
if [ -f "$WATCHDOG_PID_FILE" ]; then
kill "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null
local watchdog_pid=$(cat "$WATCHDOG_PID_FILE")
# Double-check: verify the process is actually our watchdog
if ps -p "$watchdog_pid" -o comm= 2>/dev/null | grep -q "wssshc-watchdog"; then
kill "$watchdog_pid" 2>/dev/null
rm -f "$WATCHDOG_PID_FILE"
echo "Watchdog stopped"
else
echo "PID file exists but process is not wssshc-watchdog"
rm -f "$WATCHDOG_PID_FILE"
fi
else
echo "Watchdog is not running"
fi
;;
status)
if [ -f "$WATCHDOG_PID_FILE" ] && kill -0 "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null; then
echo "Watchdog is running (PID: $(cat "$WATCHDOG_PID_FILE"))"
if [ -f "$WATCHDOG_PID_FILE" ]; then
local watchdog_pid=$(cat "$WATCHDOG_PID_FILE")
if kill -0 "$watchdog_pid" 2>/dev/null; then
# Double-check: verify the process is actually our watchdog
if ps -p "$watchdog_pid" -o comm= 2>/dev/null | grep -q "wssshc-watchdog"; then
echo "Watchdog is running (PID: $watchdog_pid)"
else
echo "Watchdog PID file exists but process is not wssshc-watchdog"
rm -f "$WATCHDOG_PID_FILE"
fi
else
echo "Watchdog PID file exists but process is not running"
rm -f "$WATCHDOG_PID_FILE"
fi
else
echo "Watchdog is not running"
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
fi
# Also check daemon status
if is_daemon_running; then
local daemon_pid=$(cat "$PID_FILE")
echo "Daemon is running (PID: $daemon_pid)"
else
echo "Daemon is not running"
fi
;;
restart)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment