服务器综合监控告警脚本
一键检查服务器核心指标:CPU、内存、磁盘、网络、关键进程状态,适合作为定时巡检脚本。
#!/bin/bash
# server_health_check.sh
#
# Shared state for the report:
#   ALERT_MSG - starts empty
#   ALERT     - starts at 0 (nothing flagged yet)
#   HOSTNAME  - output of hostname(1), shown in the banner
ALERT_MSG=""
ALERT=0
HOSTNAME=$(hostname)

# Banner: rule, host name, current date, rule, then one blank line.
printf '%s\n' \
  "==========================================" \
  " 服务器健康检查: $HOSTNAME" \
  " $(date)" \
  "==========================================" \
  ""
# CPU check
echo "--- CPU ---"

# Read top(1) batch output on stdin and print the idle percentage with the
# fractional part dropped. Prints nothing when no "Cpu(s)" line carrying an
# "id" value is found.
# The value is located by its "id" label rather than by field position,
# because top prints "ni,100.0 id" with no space when a value reaches 100 and
# older versions print "99.5%id"; both shift whitespace-separated field numbers.
_parse_cpu_idle() {
  awk '
    /Cpu\(s\)/ && match($0, /[0-9]+(\.[0-9]+)?[% ]*id/) {
      value = substr($0, RSTART, RLENGTH)
      sub(/[^0-9].*$/, "", value)   # keep only the leading integer digits
      print value
      exit
    }
  '
}

# LC_ALL=C keeps the decimal separator and the "load average:" label stable
# regardless of the system locale.
CPU_IDLE=$(LC_ALL=C top -bn1 2>/dev/null | _parse_cpu_idle)
LOAD=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}')

if [[ "$CPU_IDLE" =~ ^[0-9]+$ ]]; then
  # 10# forces base 10 so a value with a leading zero is not read as octal.
  CPU_USAGE=$((100 - 10#$CPU_IDLE))
  echo "CPU使用率: ${CPU_USAGE}%"
else
  CPU_USAGE=""
  echo "CPU使用率: N/A"
fi
echo "负载: $LOAD"

if [[ -z "$CPU_USAGE" ]]; then
  # A metric that cannot be read must not let the report end as "all normal".
  echo " WARNING: 无法获取CPU使用率"
  ALERT=1
elif (( CPU_USAGE > 90 )); then
  echo " WARNING: CPU使用率超过90%"
  ALERT=1
fi
echo ""
# Memory check
echo "--- 内存 ---"

# Print used*100/total as a truncated integer.
# Arguments: $1 - used amount, $2 - total amount (non-negative integers)
# Returns:   1 and prints nothing when an argument is not a non-negative
#            integer or total is zero, so the caller never divides by zero.
_mem_percent() {
  local used=$1 total=$2
  [[ "$used" =~ ^[0-9]+$ && "$total" =~ ^[0-9]+$ ]] || return 1
  (( 10#$total > 0 )) || return 1
  printf '%s\n' "$(( 10#$used * 100 / 10#$total ))"
}

# LC_ALL=C: free(1) translates its row labels, so "Mem:" is only reliable in
# the C locale.
MEM_INFO=$(LC_ALL=C free -m 2>/dev/null | grep '^Mem:')
# Columns of the Mem row: label, total, used, rest.
read -r _ MEM_TOTAL MEM_USED _ <<<"$MEM_INFO"

if MEM_PERCENT=$(_mem_percent "$MEM_USED" "$MEM_TOTAL"); then
  echo "内存: ${MEM_USED}MB / ${MEM_TOTAL}MB (${MEM_PERCENT}%)"
  if (( MEM_PERCENT > 85 )); then
    echo " WARNING: 内存使用率超过85%"
    ALERT=1
  fi
else
  # A metric that cannot be read must not let the report end as "all normal".
  echo "内存: N/A"
  echo " WARNING: 无法获取内存信息"
  ALERT=1
fi
echo ""
# Swap check

# Read `free -m` output on stdin and print the "used" column of the Swap row.
# Prints 0 when no Swap row with a numeric used value is present, so the
# caller always receives an integer it can compare.
_swap_used_mb() {
  local label used
  while read -r label _ used _; do
    if [[ "$label" == "Swap:" && "$used" =~ ^[0-9]+$ ]]; then
      printf '%s\n' "$used"
      return 0
    fi
  done
  printf '0\n'
}

# LC_ALL=C: free(1) translates its row labels, so "Swap:" is only reliable in
# the C locale.
SWAP_INFO=$(LC_ALL=C free -m 2>/dev/null | grep '^Swap:')
SWAP_USED=$(_swap_used_mb <<<"$SWAP_INFO")
if [ "$SWAP_USED" -gt 100 ]; then
  echo " WARNING: Swap使用了 ${SWAP_USED}MB"
  ALERT=1
fi
# Disk check
echo "--- 磁盘 ---"

# Read `df -P` output on stdin, print "<mount>: <pct>%" for every /dev/*
# filesystem, and warn (setting ALERT=1) when usage exceeds the threshold.
# Globals:   ALERT (written)
# Arguments: $1 - usage threshold in percent (default 85)
# Must be called with stdin redirected rather than at the end of a pipeline,
# otherwise it runs in a subshell and the ALERT assignment is lost.
_report_disk_usage() {
  local threshold=${1:-85}
  local device pct mount
  # The last variable takes the rest of the line, so mount points containing
  # spaces stay intact.
  while read -r device _ _ _ pct mount; do
    [[ "$device" == /dev/* ]] || continue
    pct=${pct%\%}
    # Skip rows whose capacity column is not a plain number (e.g. "-").
    [[ "$pct" =~ ^[0-9]+$ ]] || continue
    echo "$mount: ${pct}%"
    if (( 10#$pct > threshold )); then
      echo " WARNING: $mount 使用率 ${pct}%"
      ALERT=1
    fi
  done
}

# -P guarantees one line per filesystem (no wrapping on long device names);
# LC_ALL=C keeps the output format stable.
_report_disk_usage < <(LC_ALL=C df -P 2>/dev/null)
echo ""
# Key service check
echo "--- 关键进程 ---"

# Report whether each named unit is active according to
# `systemctl is-active`; any unit that is not active sets ALERT=1.
# Globals:   ALERT (written)
# Arguments: unit names to check
_check_services() {
  local unit
  for unit in "$@"; do
    if systemctl is-active "$unit" &>/dev/null; then
      echo " $unit: running"
    else
      echo " $unit: STOPPED"
      ALERT=1
    fi
  done
}

# CHECK_SERVICES (space-separated unit names) overrides the built-in list, so
# hosts that do not run every default service can avoid permanent alerts.
read -r -a _service_list <<<"${CHECK_SERVICES:-nginx httpd php-fpm mysqld redis}"
_check_services "${_service_list[@]}"
echo ""
# Network connection counts
echo "--- 网络 ---"

# Read `ss -s` output on stdin and print the number that follows "estab" in
# the TCP summary line, e.g. "TCP:   12 (estab 3, closed 1, ...)" -> 3.
# The second whitespace field of that line is the total TCP socket count, not
# the established count, so the value is located by its label instead.
# Returns 1 and prints nothing when no such value is found.
_established_count() {
  local line
  while IFS= read -r line; do
    if [[ "$line" =~ estab[[:space:]]+([0-9]+) ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
      return 0
    fi
  done
  return 1
}

CONN_COUNT=$(ss -s 2>/dev/null | _established_count)
echo "已建立连接数: ${CONN_COUNT:-N/A}"
# grep -c still prints 0 when nothing matches, so TIME_WAIT is always numeric.
TIME_WAIT=$(ss -ant 2>/dev/null | grep -c TIME-WAIT)
echo "TIME_WAIT连接: $TIME_WAIT"
echo ""
# Recent logins: section header, then the output of last(1) invoked with
# -5 -w, then one blank line.
printf '%s\n' "--- 最近登录 ---"
last -5 -w
printf '\n'
# Final verdict: ALERT equal to 0 means no check raised a warning; any other
# outcome of the comparison falls through to the "abnormal" message.
if test "$ALERT" -eq 0; then
  printf '%s\n' "结果: 全部正常"
else
  printf '%s\n' "结果: 存在异常,请检查上述WARNING项"
fi