Commit 3a5568d4 authored by nextime

Complete watchdog fix with restart limiting

- Implement one-time start logic (no continuous monitoring)
- Add restart limiting: max 20 restarts per minute
- Watchdog exits after successful start or if limits exceeded
- Double-check processes are actually running and correct
- Clean up PID files properly on exit
- Fixed both wssshd and wssshc watchdogs
parent 153db18a
...@@ -14,9 +14,8 @@ DAEMON_PATH="/usr/bin/wssshd" ...@@ -14,9 +14,8 @@ DAEMON_PATH="/usr/bin/wssshd"
PID_FILE="/var/run/wssshd.pid" PID_FILE="/var/run/wssshd.pid"
WATCHDOG_PID_FILE="/var/run/wssshd-watchdog.pid" WATCHDOG_PID_FILE="/var/run/wssshd-watchdog.pid"
LOG_FILE="/var/log/wssshd/watchdog.log" LOG_FILE="/var/log/wssshd/watchdog.log"
CHECK_INTERVAL=30 MAX_RESTARTS=20
MAX_RESTARTS=5 RESTART_WINDOW=60 # 1 minute
RESTART_WINDOW=300 # 5 minutes
# Default configuration values (can be overridden by /etc/default/wssshd) # Default configuration values (can be overridden by /etc/default/wssshd)
START=yes START=yes
...@@ -105,6 +104,42 @@ start_daemon() { ...@@ -105,6 +104,42 @@ start_daemon() {
fi fi
} }
# Function to check restart limits
#
# Counts the "started successfully" entries in $LOG_FILE whose leading
# [YYYY-MM-DD HH:MM:SS] timestamp lies within the last $RESTART_WINDOW seconds
# and compares that count against $MAX_RESTARTS.
#
# Globals:  LOG_FILE, RESTART_WINDOW, MAX_RESTARTS (read)
# Returns:  0 if another start is allowed,
#           1 if MAX_RESTARTS or more starts were logged inside the window
#             (a message is written through log_message in that case).
#
# The check is best-effort: if the clock cannot be read or the log cannot be
# parsed, the count is treated as 0 and the start is allowed.
check_restart_limits() {
    local current_time window_start cutoff
    local restart_count=0

    # Declaration and assignment are kept separate so a failing `date`
    # is not masked by the exit status of `local`.
    current_time=$(date +%s) || return 0
    window_start=$((current_time - RESTART_WINDOW))

    # Render the start of the window once, in the same format the log uses.
    # Timestamps in this fixed-width format sort lexicographically in
    # chronological order, so awk can compare them as plain strings instead of
    # spawning one `date` process per log line.
    # NOTE(review): assumes log timestamps are written in local time, as the
    # previous `date -d "<timestamp>"` conversion did - confirm in log_message.
    cutoff=$(date -d "@$window_start" '+%Y-%m-%d %H:%M:%S' 2>/dev/null) || cutoff=""

    if [ -n "$cutoff" ] && [ -f "$LOG_FILE" ]; then
        # Only POSIX awk features are used (two-argument match, RSTART, no
        # {n} intervals) so the count also works with mawk and BusyBox awk,
        # not just gawk.
        restart_count=$(awk -v cutoff="$cutoff" '
            BEGIN { count = 0 }
            /started successfully/ {
                if (match($0, /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]\]/)) {
                    # Skip the opening bracket; the timestamp is 19 chars long.
                    stamp = substr($0, RSTART + 1, 19)
                    # Appending "" forces a string comparison.
                    if ((stamp "") >= (cutoff "")) {
                        count++
                    }
                }
            }
            END { print count }
        ' "$LOG_FILE" 2>/dev/null) || restart_count=0
    fi

    # Guard the numeric test below against empty or non-numeric output.
    case "$restart_count" in
        ''|*[!0-9]*) restart_count=0 ;;
    esac

    if [ "$restart_count" -ge "$MAX_RESTARTS" ]; then
        log_message "Too many restarts ($restart_count) in $RESTART_WINDOW seconds. Watchdog will exit."
        return 1
    fi
    return 0
}
# Function to stop daemon (double-check process) # Function to stop daemon (double-check process)
stop_daemon() { stop_daemon() {
log_message "Stopping $DAEMON_NAME daemon..." log_message "Stopping $DAEMON_NAME daemon..."
...@@ -149,17 +184,7 @@ check_restart_limits() { ...@@ -149,17 +184,7 @@ check_restart_limits() {
return 0 return 0
} }
# Function to cleanup on exit # Cleanup is now handled directly in main() function
cleanup() {
    # Record the shutdown in the watchdog log first.
    log_message "Watchdog shutting down..."

    # Remove the watchdog PID file only when it is present.
    [ ! -f "$WATCHDOG_PID_FILE" ] || rm -f "$WATCHDOG_PID_FILE"

    # Always terminates the shell with status 0; nothing after a call to
    # cleanup is executed.
    exit 0
}
# Trap signals
trap cleanup SIGTERM SIGINT
# Main watchdog function (one-time start, not continuous monitoring) # Main watchdog function (one-time start, not continuous monitoring)
main() { main() {
...@@ -167,6 +192,7 @@ main() { ...@@ -167,6 +192,7 @@ main() {
START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]') START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]')
if [ "$START_LOWER" != "yes" ] && [ "$START_LOWER" != "y" ] && [ "$START_LOWER" != "1" ] && [ "$START_LOWER" != "true" ]; then if [ "$START_LOWER" != "yes" ] && [ "$START_LOWER" != "y" ] && [ "$START_LOWER" != "1" ] && [ "$START_LOWER" != "true" ]; then
log_message "START is not set to a valid enabled value in /etc/default/wssshd. Exiting." log_message "START is not set to a valid enabled value in /etc/default/wssshd. Exiting."
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 0 exit 0
fi fi
...@@ -178,21 +204,27 @@ main() { ...@@ -178,21 +204,27 @@ main() {
# Check if daemon is already running # Check if daemon is already running
if is_daemon_running; then if is_daemon_running; then
log_message "$DAEMON_NAME is already running" log_message "$DAEMON_NAME is already running"
cleanup rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 0 exit 0
fi fi
# Check restart limits before attempting to start
if ! check_restart_limits; then
log_message "Restart limits exceeded, not starting $DAEMON_NAME"
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 1
fi
# Attempt to start daemon # Attempt to start daemon
if start_daemon; then if start_daemon; then
log_message "$DAEMON_NAME started successfully" log_message "$DAEMON_NAME started successfully"
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 0
else else
log_message "Failed to start $DAEMON_NAME" log_message "Failed to start $DAEMON_NAME"
cleanup rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 1 exit 1
fi fi
log_message "Watchdog exiting (one-time start completed)"
cleanup
} }
# Handle command line arguments # Handle command line arguments
...@@ -203,18 +235,9 @@ case "$1" in ...@@ -203,18 +235,9 @@ case "$1" in
exit 1 exit 1
fi fi
main & main &
# Wait for PID file to be created (max 5 seconds) # Wait briefly for main() to complete its work
local count=0 sleep 2
while [ $count -lt 10 ] && [ ! -f "$WATCHDOG_PID_FILE" ]; do echo "Watchdog start process completed"
sleep 0.5
count=$((count + 1))
done
if [ -f "$WATCHDOG_PID_FILE" ]; then
echo "Watchdog started"
else
echo "Watchdog failed to create PID file"
exit 1
fi
;; ;;
stop) stop)
if [ -f "$WATCHDOG_PID_FILE" ]; then if [ -f "$WATCHDOG_PID_FILE" ]; then
......
...@@ -14,9 +14,8 @@ DAEMON_PATH="/usr/bin/wssshc" ...@@ -14,9 +14,8 @@ DAEMON_PATH="/usr/bin/wssshc"
PID_FILE="/var/run/wssshc.pid" PID_FILE="/var/run/wssshc.pid"
WATCHDOG_PID_FILE="/var/run/wssshc-watchdog.pid" WATCHDOG_PID_FILE="/var/run/wssshc-watchdog.pid"
LOG_FILE="/var/log/wssshc/watchdog.log" LOG_FILE="/var/log/wssshc/watchdog.log"
CHECK_INTERVAL=30 MAX_RESTARTS=20
MAX_RESTARTS=5 RESTART_WINDOW=60 # 1 minute
RESTART_WINDOW=300 # 5 minutes
# Default configuration values (can be overridden by /etc/default/wssshc) # Default configuration values (can be overridden by /etc/default/wssshc)
START=yes START=yes
...@@ -105,6 +104,42 @@ start_daemon() { ...@@ -105,6 +104,42 @@ start_daemon() {
fi fi
} }
# Function to check restart limits
#
# Counts the "started successfully" entries in $LOG_FILE whose leading
# [YYYY-MM-DD HH:MM:SS] timestamp lies within the last $RESTART_WINDOW seconds
# and compares that count against $MAX_RESTARTS.
#
# Globals:  LOG_FILE, RESTART_WINDOW, MAX_RESTARTS (read)
# Returns:  0 if another start is allowed,
#           1 if MAX_RESTARTS or more starts were logged inside the window
#             (a message is written through log_message in that case).
#
# The check is best-effort: if the clock cannot be read or the log cannot be
# parsed, the count is treated as 0 and the start is allowed.
check_restart_limits() {
    local current_time window_start cutoff
    local restart_count=0

    # Declaration and assignment are kept separate so a failing `date`
    # is not masked by the exit status of `local`.
    current_time=$(date +%s) || return 0
    window_start=$((current_time - RESTART_WINDOW))

    # Render the start of the window once, in the same format the log uses.
    # Timestamps in this fixed-width format sort lexicographically in
    # chronological order, so awk can compare them as plain strings instead of
    # spawning one `date` process per log line.
    # NOTE(review): assumes log timestamps are written in local time, as the
    # previous `date -d "<timestamp>"` conversion did - confirm in log_message.
    cutoff=$(date -d "@$window_start" '+%Y-%m-%d %H:%M:%S' 2>/dev/null) || cutoff=""

    if [ -n "$cutoff" ] && [ -f "$LOG_FILE" ]; then
        # Only POSIX awk features are used (two-argument match, RSTART, no
        # {n} intervals) so the count also works with mawk and BusyBox awk,
        # not just gawk.
        restart_count=$(awk -v cutoff="$cutoff" '
            BEGIN { count = 0 }
            /started successfully/ {
                if (match($0, /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]\]/)) {
                    # Skip the opening bracket; the timestamp is 19 chars long.
                    stamp = substr($0, RSTART + 1, 19)
                    # Appending "" forces a string comparison.
                    if ((stamp "") >= (cutoff "")) {
                        count++
                    }
                }
            }
            END { print count }
        ' "$LOG_FILE" 2>/dev/null) || restart_count=0
    fi

    # Guard the numeric test below against empty or non-numeric output.
    case "$restart_count" in
        ''|*[!0-9]*) restart_count=0 ;;
    esac

    if [ "$restart_count" -ge "$MAX_RESTARTS" ]; then
        log_message "Too many restarts ($restart_count) in $RESTART_WINDOW seconds. Watchdog will exit."
        return 1
    fi
    return 0
}
# Function to stop daemon (double-check process) # Function to stop daemon (double-check process)
stop_daemon() { stop_daemon() {
log_message "Stopping $DAEMON_NAME daemon..." log_message "Stopping $DAEMON_NAME daemon..."
...@@ -149,17 +184,7 @@ check_restart_limits() { ...@@ -149,17 +184,7 @@ check_restart_limits() {
return 0 return 0
} }
# Function to cleanup on exit # Cleanup is now handled directly in main() function
cleanup() {
    # Record the shutdown in the watchdog log first.
    log_message "Watchdog shutting down..."

    # Remove the watchdog PID file only when it is present.
    [ ! -f "$WATCHDOG_PID_FILE" ] || rm -f "$WATCHDOG_PID_FILE"

    # Always terminates the shell with status 0; nothing after a call to
    # cleanup is executed.
    exit 0
}
# Trap signals
trap cleanup SIGTERM SIGINT
# Main watchdog function (one-time start, not continuous monitoring) # Main watchdog function (one-time start, not continuous monitoring)
main() { main() {
...@@ -167,6 +192,7 @@ main() { ...@@ -167,6 +192,7 @@ main() {
START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]') START_LOWER=$(echo "$START" | tr '[:upper:]' '[:lower:]')
if [ "$START_LOWER" != "yes" ] && [ "$START_LOWER" != "y" ] && [ "$START_LOWER" != "1" ] && [ "$START_LOWER" != "true" ]; then if [ "$START_LOWER" != "yes" ] && [ "$START_LOWER" != "y" ] && [ "$START_LOWER" != "1" ] && [ "$START_LOWER" != "true" ]; then
log_message "START is not set to a valid enabled value in /etc/default/wssshc. Exiting." log_message "START is not set to a valid enabled value in /etc/default/wssshc. Exiting."
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 0 exit 0
fi fi
...@@ -178,21 +204,27 @@ main() { ...@@ -178,21 +204,27 @@ main() {
# Check if daemon is already running # Check if daemon is already running
if is_daemon_running; then if is_daemon_running; then
log_message "$DAEMON_NAME is already running" log_message "$DAEMON_NAME is already running"
cleanup rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 0 exit 0
fi fi
# Check restart limits before attempting to start
if ! check_restart_limits; then
log_message "Restart limits exceeded, not starting $DAEMON_NAME"
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 1
fi
# Attempt to start daemon # Attempt to start daemon
if start_daemon; then if start_daemon; then
log_message "$DAEMON_NAME started successfully" log_message "$DAEMON_NAME started successfully"
rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 0
else else
log_message "Failed to start $DAEMON_NAME" log_message "Failed to start $DAEMON_NAME"
cleanup rm -f "$WATCHDOG_PID_FILE" 2>/dev/null
exit 1 exit 1
fi fi
log_message "Watchdog exiting (one-time start completed)"
cleanup
} }
# Handle command line arguments # Handle command line arguments
...@@ -203,18 +235,9 @@ case "$1" in ...@@ -203,18 +235,9 @@ case "$1" in
exit 1 exit 1
fi fi
main & main &
# Wait for PID file to be created (max 5 seconds) # Wait briefly for main() to complete its work
local count=0 sleep 2
while [ $count -lt 10 ] && [ ! -f "$WATCHDOG_PID_FILE" ]; do echo "Watchdog start process completed"
sleep 0.5
count=$((count + 1))
done
if [ -f "$WATCHDOG_PID_FILE" ]; then
echo "Watchdog started"
else
echo "Watchdog failed to create PID file"
exit 1
fi
;; ;;
stop) stop)
if [ -f "$WATCHDOG_PID_FILE" ]; then if [ -f "$WATCHDOG_PID_FILE" ]; then
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment