Linux

关注公众号 jb51net

关闭
首页 > 网站技巧 > 服务器 > Linux > Linux运维实用脚本集

Linux运维实用脚本集详解

作者:ICT董老师

本文分享了作者在实际生产环境中积累的实用脚本,涵盖监控、备份、诊断、自动化等多个场景。建议将脚本保存在 /opt/scripts/ 目录并设置执行权限,同时添加日志记录和错误处理。文中给出了系统监控、备份、自动化部署等脚本示例。

以下是在实际生产环境中积累的实用脚本,涵盖监控、备份、诊断、自动化等场景。

脚本使用建议

所有脚本保存到 /opt/scripts/ 目录并设置权限

添加日志记录和错误处理

通过 crontab -e 设置定时任务

一、系统监控与告警类

1.1 综合系统监控脚本

#!/bin/bash
# File: /opt/scripts/system_monitor.sh
# Purpose: one-shot health check of CPU, memory, disk usage, key processes
#          and established TCP connections. Every finding is printed to
#          stdout and appended to LOG_FILE.
# Schedule: intended for cron, e.g. every 5 minutes.

LOG_FILE="/var/log/system_monitor.log"
THRESHOLD_CPU=80     # CPU usage threshold (%)
THRESHOLD_MEM=85     # memory usage threshold (%)
THRESHOLD_DISK=90    # disk usage threshold (%)

# Print a message to stdout and append it to LOG_FILE.
report() {
    printf '%s\n' "$*" | tee -a "$LOG_FILE"
}

# Timestamp header for this run
report "====== $(date '+%Y-%m-%d %H:%M:%S') ======"

# 1. CPU
# Usage is computed as 100 - idle so that system, iowait and steal time are
# counted too (reading only the user-time field under-reports load).
# LC_ALL=C keeps '.' as the decimal separator so the comma-separated
# summary line splits cleanly.
CPU_USAGE=$(LC_ALL=C top -bn1 | awk -F',' '
    /Cpu\(s\)/ {
        for (i = 1; i <= NF; i++) {
            if ($i ~ /id/) {
                gsub(/[^0-9.]/, "", $i)
                printf "%.1f\n", 100 - $i
                exit
            }
        }
    }')
if [ -z "$CPU_USAGE" ]; then
    report "⚠️  警告:无法获取CPU使用率"
elif awk -v used="$CPU_USAGE" -v limit="$THRESHOLD_CPU" 'BEGIN { exit !(used > limit) }'; then
    # awk does the floating-point comparison, so bc is not required.
    report "⚠️  警告:CPU使用率过高 - ${CPU_USAGE}%"
    # Hook for an alert action, e.g. sending a mail:
    # /opt/scripts/send_alert.sh "CPU报警" "CPU使用率: ${CPU_USAGE}%"
fi

# 2. Memory
# A single `free` call yields a consistent total/used pair; LC_ALL=C keeps
# the "Mem:" row label untranslated.
read -r MEM_TOTAL MEM_USED < <(LC_ALL=C free -m | awk '/^Mem:/ {print $2, $3}')
if [[ "$MEM_TOTAL" =~ ^[0-9]+$ && "$MEM_USED" =~ ^[0-9]+$ ]] && (( MEM_TOTAL > 0 )); then
    MEM_PERCENT=$(( MEM_USED * 100 / MEM_TOTAL ))
    if (( MEM_PERCENT > THRESHOLD_MEM )); then
        report "⚠️  警告:内存使用率过高 - ${MEM_PERCENT}%"
        # Header line plus the ten largest memory consumers
        ps aux --sort=-%mem | head -11 | tee -a "$LOG_FILE"
    fi
else
    report "⚠️  警告:无法获取内存使用率"
fi

# 3. Disk
# df -P guarantees one output line per filesystem (long device names would
# otherwise wrap); the last `read` variable receives the mount point.
df -P | grep '^/dev/' | while read -r _ _ _ _ USAGE MOUNT; do
    USAGE=${USAGE%\%}
    [[ "$USAGE" =~ ^[0-9]+$ ]] || continue
    if (( USAGE > THRESHOLD_DISK )); then
        report "⚠️  警告:磁盘 $MOUNT 使用率过高 - ${USAGE}%"
    fi
done

# 4. Key processes
# pgrep -x matches the exact process name, so the daemon names must be used
# ("mysqld" / "redis-server", not "mysql" / "redis"). Adjust to the names
# that are actually running on the host (e.g. "mariadbd").
PROCESS_LIST=("nginx" "mysqld" "redis-server" "sshd")
for proc in "${PROCESS_LIST[@]}"; do
    if ! pgrep -x "$proc" >/dev/null; then
        report "❌ 关键进程 $proc 未运行!"
    fi
done

# 5. Established TCP connections (useful on web servers)
CONN_COUNT=""
if command -v ss &> /dev/null; then
    # tail skips the header line printed by ss
    CONN_COUNT=$(ss -tn state established | tail -n +2 | wc -l)
elif command -v netstat &> /dev/null; then
    CONN_COUNT=$(netstat -ant | grep -c ESTABLISHED)
fi
if [ -n "$CONN_COUNT" ]; then
    report "当前ESTABLISHED连接数: $CONN_COUNT"
fi

report "监控完成"

1.2 实时进程资源监控

#!/bin/bash
# File: /opt/scripts/process_watch.sh
# Purpose: periodically print CPU/memory figures for the processes matching
#          a name pattern or a PID (a very small "top").
# Usage:   ./process_watch.sh <process name or PID>
#          An all-digit argument is treated as a PID and matched exactly;
#          anything else is a regex matched against the command line.

PROCESS_NAME=$1
INTERVAL=2  # seconds between samples

if [ -z "$PROCESS_NAME" ]; then
    echo "用法: $0 <进程名或PID>"
    exit 1
fi

echo "监控进程: $PROCESS_NAME,按Ctrl+C退出"
echo "时间戳          PID   CPU%   MEM%   虚拟内存   物理内存   进程名"
echo "----------------------------------------------------------------"

while true; do
    # Take one snapshot without headers. The columns are selected explicitly
    # so the match can be limited to the PID or the command line instead of
    # the whole `ps aux` line.
    SNAPSHOT=$(ps -eo pid=,ppid=,pcpu=,pmem=,vsz=,rss=,args=)

    MATCHES=$(printf '%s\n' "$SNAPSHOT" | awk \
        -v pat="$PROCESS_NAME" -v self="$$" -v ts="$(date '+%H:%M:%S')" '
        BEGIN { by_pid = (pat ~ /^[0-9]+$/) }
        # Skip this script and its helper children: their command lines
        # contain the pattern too.
        $1 == self || $2 == self { next }
        {
            # Rebuild the full command line (everything after six columns).
            cmd = $0
            for (i = 1; i <= 6; i++) sub(/^[ \t]*[^ \t]+/, "", cmd)
            sub(/^[ \t]+/, "", cmd)

            hit = 0
            if (by_pid) { if ($1 == pat) hit = 1 }
            else if (cmd ~ pat) hit = 1

            if (hit)
                printf "%s %6s %5s %6s %10s %10s   %s\n", ts, $1, $3, $4, $5, $6, $7
        }')

    if [ -n "$MATCHES" ]; then
        printf '%s\n' "$MATCHES"

        # Open file descriptors of the first matching process. Entries are
        # counted directly; `ls -l | wc -l` would also count the "total" line.
        PID=$(awk 'NR == 1 {print $2}' <<< "$MATCHES")
        if [ -r "/proc/$PID/fd" ]; then
            FILE_COUNT=$(find "/proc/$PID/fd" -mindepth 1 -maxdepth 1 2>/dev/null | wc -l)
        else
            FILE_COUNT=""   # not readable (other user's process) or already gone
        fi
        echo "  打开文件数: ${FILE_COUNT:-N/A}"
    fi

    sleep "$INTERVAL"
done

二、备份与同步类

2.1 智能增量备份脚本

#!/bin/bash
# File: /opt/scripts/smart_backup.sh
# Purpose: tar-based backup of SOURCE_DIR.
#   - Full archive on Sundays, when no full archive exists yet, or when the
#     script is called with --force-full.
#   - On other days only files modified within the last 24 hours are archived.
#   - Archives older than RETENTION_DAYS and backup logs older than 30 days
#     are pruned.
# Configure: SOURCE_DIR and BACKUP_DIR below.
# Exit status: 0 on success (including "nothing changed"), 1 on failure.

SOURCE_DIR="/data/www"           # directory to back up
BACKUP_DIR="/backup/www"         # where archives are stored
RETENTION_DAYS=7                 # days to keep archives
DATE=$(date '+%Y%m%d_%H%M%S')
BACKUP_LOG="/var/log/backup_${DATE}.log"

mkdir -p "$BACKUP_DIR"/{full,incremental}

# Print a timestamped message and append it to this run's log file.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$BACKUP_LOG"
}

# Abort unless BACKUP_DIR's filesystem has at least $1 KB available.
check_disk_space() {
    local required=$1
    local available
    # -P keeps each filesystem on one line; -k fixes the unit to KB.
    available=$(df -Pk "$BACKUP_DIR" | awk 'NR==2 {print $4}')

    if ! [[ "$available" =~ ^[0-9]+$ ]]; then
        log "❌ 无法获取磁盘可用空间: $BACKUP_DIR"
        exit 1
    fi
    if (( available < required )); then
        log "❌ 磁盘空间不足!可用: ${available}KB, 需要: ${required}KB"
        exit 1
    fi
}

# Run tar with the given arguments, sending its stderr to the log.
# GNU tar exits 1 when files changed while being read (archive still
# usable) and 2 on fatal errors, so only a status above 1 aborts the run.
run_tar() {
    local status
    tar "$@" 2>>"$BACKUP_LOG"
    status=$?
    if (( status > 1 )); then
        log "❌ 备份失败!tar 退出码: $status"
        rm -f -- "$BACKUP_PATH"   # do not leave a truncated archive behind
        exit 1
    elif (( status == 1 )); then
        log "⚠️  部分文件在备份过程中发生变化"
    fi
}

# Decide the backup type. A missing full archive is handled here directly;
# re-invoking the script with an unparsed flag would loop forever.
if [ "$(date '+%u')" -eq 7 ] || [ "$1" = "--force-full" ]; then
    BACKUP_TYPE="full"
elif [ -z "$(find "$BACKUP_DIR/full" -maxdepth 1 -type f -name '*.tar.gz' -print -quit)" ]; then
    log "未找到全量备份,执行全量备份..."
    BACKUP_TYPE="full"
else
    BACKUP_TYPE="incremental"
fi

if [ "$BACKUP_TYPE" = "full" ]; then
    BACKUP_PATH="$BACKUP_DIR/full/backup_${DATE}.tar.gz"

    # Estimated size in KB plus a 10% safety margin
    EST_SIZE=$(du -sk "$SOURCE_DIR" | awk '{print $1}')
    check_disk_space $(( ${EST_SIZE:-0} * 110 / 100 ))

    log "开始全量备份..."
    run_tar -czf "$BACKUP_PATH" "$SOURCE_DIR"

    # Old full archives are removed only after the new one was written.
    find "$BACKUP_DIR/full" -type f -mtime +"$RETENTION_DAYS" -delete
else
    BACKUP_PATH="$BACKUP_DIR/incremental/inc_${DATE}.tar.gz"

    # Private temp file, removed on every exit path. The list is
    # NUL-delimited so unusual file names survive the round trip to tar.
    CHANGED_LIST=$(mktemp) || { log "❌ 无法创建临时文件"; exit 1; }
    trap 'rm -f -- "$CHANGED_LIST"' EXIT

    # Files modified within the last 24 hours
    find "$SOURCE_DIR" -type f -mtime -1 -print0 > "$CHANGED_LIST"

    if [ -s "$CHANGED_LIST" ]; then
        run_tar -czf "$BACKUP_PATH" --null -T "$CHANGED_LIST"
        log "增量备份完成,文件数: $(tr -cd '\0' < "$CHANGED_LIST" | wc -c)"
    else
        log "没有文件变化,跳过备份"
        BACKUP_PATH=""   # nothing was written, so there is nothing to verify
    fi

    # Incremental archives follow the same retention period.
    find "$BACKUP_DIR/incremental" -type f -mtime +"$RETENTION_DAYS" -delete
fi

# Verify the archive (skipped when there was nothing to back up)
if [ -n "$BACKUP_PATH" ]; then
    if [ -s "$BACKUP_PATH" ]; then
        BACKUP_SIZE=$(du -h "$BACKUP_PATH" | awk '{print $1}')
        log "✅ 备份成功: $BACKUP_TYPE备份, 大小: $BACKUP_SIZE, 路径: $BACKUP_PATH"

        # Optional success notification:
        # echo "备份成功: $(hostname) - $BACKUP_TYPE" | mail -s "备份成功通知" admin@example.com
    else
        log "❌ 备份失败!"
        exit 1
    fi
fi

# Prune old backup logs
find /var/log/ -name "backup_*.log" -mtime +30 -delete

2.2 MySQL数据库备份

#!/bin/bash
# File: /opt/scripts/mysql_backup.sh
# Purpose: dump all MySQL databases to a gzip file, copy the binary logs and
#          prune old backups.
# Configure: set the connection values below, or export MYSQL_USER /
#            MYSQL_PASS / MYSQL_HOST before running.
# Exit status: 0 on success, 1 when the dump fails.

MYSQL_USER="${MYSQL_USER:-backup}"
MYSQL_PASS="${MYSQL_PASS:-your_password}"
MYSQL_HOST="${MYSQL_HOST:-localhost}"
BACKUP_DIR="/backup/mysql"
RETENTION_DAYS=30
DATE=$(date '+%Y%m%d')
TIME=$(date '+%H%M')
DUMP_FILE="$BACKUP_DIR/full/all_dbs_${DATE}_${TIME}.sql.gz"

# Print a timestamped message.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

mkdir -p "$BACKUP_DIR"/{full,binlog}

# Credentials go through a private option file instead of -p<password>, so
# the password never shows up in the process list. mktemp creates the file
# readable by the owner only; the trap removes it on every exit path.
MYSQL_CNF=$(mktemp) || { log "❌ 无法创建临时配置文件"; exit 1; }
trap 'rm -f -- "$MYSQL_CNF"' EXIT
cat > "$MYSQL_CNF" <<EOF
[client]
user=$MYSQL_USER
password="$MYSQL_PASS"
host=$MYSQL_HOST
EOF
# --defaults-extra-file must be the first option on the command line.
MYSQL_OPTS=(--defaults-extra-file="$MYSQL_CNF")

# 1. Full dump of all databases
# NOTE(review): newer MySQL releases rename --master-data to --source-data;
# confirm against the installed mysqldump version.
log "开始MySQL全量备份..."
mysqldump "${MYSQL_OPTS[@]}" \
    --all-databases \
    --single-transaction \
    --routines \
    --triggers \
    --events \
    --flush-logs \
    --master-data=2 \
    | gzip > "$DUMP_FILE"
# Copy PIPESTATUS at once: it is overwritten by the next command.
DUMP_STATUS=("${PIPESTATUS[@]}")

# Both the dump and the compression must have succeeded.
if [ "${DUMP_STATUS[0]}" -eq 0 ] && [ "${DUMP_STATUS[1]}" -eq 0 ]; then
    log "✅ 全量备份成功"

    # 2. Binary logs (only when binary logging is enabled)
    if mysql "${MYSQL_OPTS[@]}" -e "SHOW BINARY LOGS" &>/dev/null; then
        log "开始备份二进制日志..."
        # -sN prints tab-separated "name<TAB>value"
        BINLOG_BASE=$(mysql "${MYSQL_OPTS[@]}" -sN \
            -e "SHOW VARIABLES LIKE 'log_bin_basename'" | cut -f2)

        # Copy first, purge afterwards, and only purge when the copy worked,
        # so no log is removed from the server before it has been saved.
        if [ -n "$BINLOG_BASE" ] && cp -- "$BINLOG_BASE".* "$BACKUP_DIR/binlog/" 2>/dev/null; then
            mysql "${MYSQL_OPTS[@]}" \
                -e "PURGE BINARY LOGS BEFORE DATE_SUB(NOW(), INTERVAL 7 DAY)"
            # Prune copies using the server's actual binlog file prefix.
            find "$BACKUP_DIR/binlog" -name "${BINLOG_BASE##*/}.*" -mtime +7 -delete
        else
            log "⚠️  二进制日志复制失败,已跳过清理"
        fi
    fi

    # 3. Prune old dumps
    find "$BACKUP_DIR/full" -name "*.sql.gz" -mtime +"$RETENTION_DAYS" -delete

else
    log "❌ 备份失败!"
    rm -f -- "$DUMP_FILE"   # a partial dump must not pass for a backup
    exit 1
fi

三、系统维护与诊断类

3.1 自动化安全检查脚本

#!/bin/bash
# File: /opt/scripts/security_audit.sh
# Purpose: run a fixed list of baseline security checks and collect the
#          results in a dated report file under /var/log.

REPORT_FILE="/var/log/security_audit_$(date '+%Y%m%d').txt"

# Start the report; an existing report from the same day is overwritten.
cat > $REPORT_FILE <<EOF
========== 系统安全审计报告 ==========
主机名: $(hostname)
审计时间: $(date)
=====================================
EOF

# Write a section heading to both the terminal and the report.
#   $1 - item number, $2 - item title
check_item() {
    local number=$1 title=$2
    echo -e "\n[检查项 ${number}] ${title}" | tee -a $REPORT_FILE
}

# Item 1: accounts whose password field in /etc/shadow is empty
check_item "1" "检查空密码账户"
awk -F: '($2 == "") {print $1}' /etc/shadow >> $REPORT_FILE

# Item 2: regular files with the SUID bit set (first 20 hits)
check_item "2" "检查SUID权限文件"
find / -perm -4000 -type f 2>/dev/null | head -20 >> $REPORT_FILE

# Item 3: the ten most recent successful logins
check_item "3" "最近成功登录"
last -n 10 >> $REPORT_FILE

# Item 4: the ten most recent failed logins
check_item "4" "最近失败登录"
lastb -n 10 2>/dev/null >> $REPORT_FILE

# Item 5: active (uncommented) sshd settings of interest
check_item "5" "SSH配置检查"
grep -E "^PermitRootLogin|^PasswordAuthentication|^Protocol" /etc/ssh/sshd_config 2>/dev/null >> $REPORT_FILE

# Item 6: listening sockets, using ss when present and netstat otherwise
check_item "6" "监听端口检查"
port_tool=netstat
if command -v ss &> /dev/null; then
    port_tool=ss
fi
$port_tool -tulnp | grep LISTEN >> $REPORT_FILE

# Item 7: legacy services that are enabled
check_item "7" "危险服务检查"
for service in telnet vsftpd rsh rexec rlogin; do
    if systemctl is-enabled $service 2>/dev/null | grep -q "enabled"; then
        echo "$service 服务已启用" >> $REPORT_FILE
    fi
done

echo -e "\n========== 审计完成 ==========" >> $REPORT_FILE
echo "报告已保存至: $REPORT_FILE"

3.2 日志分析脚本(查找异常)

#!/bin/bash
# File: /opt/scripts/log_analyzer.sh
# Purpose: print a quick anomaly summary for one log file (error lines,
#          HTTP status codes, busiest client IPs, suspicious requests,
#          hourly volume, slow requests).
# Usage:   ./log_analyzer.sh <path to log file>
# The path is quoted everywhere so names containing spaces work.

LOG_FILE=$1

if [ ! -f "$LOG_FILE" ]; then
    echo "文件不存在: $LOG_FILE"
    exit 1
fi

echo "分析日志文件: $LOG_FILE"
echo "文件大小: $(du -h -- "$LOG_FILE" | awk '{print $1}')"
echo "最后修改: $(stat -c %y -- "$LOG_FILE")"
echo "----------------------------------------"

# 1. Error-level lines, grouped by their first three fields
echo -e "\n1. 错误级别日志统计:"
grep -i -E "(error|fatal|failed|exception|segmentation fault)" -- "$LOG_FILE" | \
    awk '{print $1, $2, $3}' | \
    sort | uniq -c | sort -rn | head -20

# 2. HTTP status code distribution (only when the file name looks like a
#    web server log; field 9 is the status in the common/combined format)
if [[ "$LOG_FILE" =~ (access|nginx|apache) ]]; then
    echo -e "\n2. HTTP状态码分布:"
    awk '{print $9}' "$LOG_FILE" | sort | uniq -c | sort -rn
fi

# 3. Most frequent values of the first field (client IP in web logs)
echo -e "\n3. 高频访问IP TOP 10:"
awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10

# 4. Requests matching common attack patterns
echo -e "\n4. 可疑请求模式:"
grep -E "(\.\./|select.*from|union.*select|eval\(|base64_decode|shell_exec)" -- "$LOG_FILE" | head -10

# 5. Request volume per time bucket.
#    Access-log lines carry "[dd/Mon/yyyy:HH:..." and are bucketed by
#    date and hour; other lines keep the split on space/colon, which yields
#    "day:hour" for syslog-style timestamps.
echo -e "\n5. 请求量时间分布:"
awk '{
    if (match($0, /\[[0-9][0-9]\/[A-Za-z]+\/[0-9]+:[0-9][0-9]/)) {
        print substr($0, RSTART + 1, RLENGTH - 1)
    } else {
        split($0, part, /[ :]/)
        print part[2] ":" part[3]
    }
}' "$LOG_FILE" | sort | uniq -c | tail -24

# 6. Slow requests: average of all "rt=<seconds>" values above one second
echo -e "\n6. 慢请求统计:"
grep -o 'rt=[0-9]*\.[0-9]*' -- "$LOG_FILE" 2>/dev/null | \
    sed 's/rt=//' | \
    awk '$1 > 1 {count++; sum += $1} END {if (count > 0) print "平均响应时间:" sum/count "秒"}'

四、自动化部署与维护

4.1 批量服务器操作脚本

#!/bin/bash
# File: /opt/scripts/batch_operation.sh
# Purpose: run one command over SSH on every host in SERVER_LIST and log
#          the prefixed output of each host.
# Requires: key-based (password-less) SSH login to every host.

COMMAND="$1"
SERVER_LIST=("server1" "server2" "server3" "192.168.1.100")
LOG_FILE="/var/log/batch_operation_$(date '+%Y%m%d').log"

# Echo to the terminal and append to the day's log. All arguments are
# handed to echo unchanged, so a leading -e enables escape sequences.
record() {
    echo "$@" | tee -a $LOG_FILE
}

# A command is mandatory; show usage otherwise.
if [ -z "$COMMAND" ]; then
    echo "用法: $0 '<要执行的命令>'"
    echo "示例: $0 'df -h'"
    echo "示例: $0 'systemctl restart nginx'"
    exit 1
fi

record "批量执行命令: $COMMAND"
record "开始时间: $(date)"
record "======================================"

for server in "${SERVER_LIST[@]}"; do
    record -e "\n处理服务器: $server"

    # Hosts that do not answer a single ping within 2 seconds are skipped.
    if ping -c 1 -W 2 $server &>/dev/null; then
        # Run the command remotely and prefix every output line with the host.
        ssh -o ConnectTimeout=5 -o BatchMode=yes $server "$COMMAND" 2>&1 | \
            while IFS= read -r line; do
                record "[$server] $line"
            done
        # Exit status of ssh, the first stage of the pipeline above
        ssh_status=${PIPESTATUS[0]}

        if [ $ssh_status -eq 0 ]; then
            record "✅ 执行成功"
        else
            record "❌ 执行失败"
        fi
    else
        record "❌ 服务器不可达"
    fi
done

record -e "\n======================================"
record "完成时间: $(date)"
record "详细日志: $LOG_FILE"

4.2 自动化证书监控和续期检查

#!/bin/bash
# File: /opt/scripts/cert_check.sh
# Purpose: report how many days remain before the TLS certificate of each
#          host:port in DOMAINS expires.
# Requires: openssl, GNU date (-d) and GNU coreutils timeout.

DOMAINS=(
    "example.com:443"
    "api.example.com:443"
    "blog.example.com:443"
)

DAYS_WARNING=30      # warn when fewer than this many days remain
CONNECT_TIMEOUT=10   # seconds before an unresponsive host is given up on

echo "SSL证书过期检查 - $(date)"
echo "================================"

for domain_info in "${DOMAINS[@]}"; do
    domain=${domain_info%%:*}
    port=${domain_info##*:}

    # Fetch the certificate's end date. stdin comes from /dev/null so
    # s_client closes right after the handshake; timeout keeps one dead
    # host from blocking the whole run.
    not_after=$(timeout "$CONNECT_TIMEOUT" \
        openssl s_client -servername "$domain" -connect "$domain:$port" \
        </dev/null 2>/dev/null \
        | openssl x509 -noout -enddate 2>/dev/null \
        | sed -n 's/^notAfter=//p')

    if [ -z "$not_after" ]; then
        echo "❌ $domain - 无法获取证书信息"
        continue
    fi

    # Convert the expiry date; skip the host if it cannot be parsed.
    expire_timestamp=$(date -d "$not_after" '+%s' 2>/dev/null)
    if [ -z "$expire_timestamp" ]; then
        echo "❌ $domain - 无法解析证书过期时间: $not_after"
        continue
    fi
    expire_date=$(date -d "$not_after" '+%Y-%m-%d')
    current_timestamp=$(date '+%s')
    seconds_left=$(( expire_timestamp - current_timestamp ))

    # Decide "expired" on seconds, not on whole days: integer division
    # truncates toward zero and would turn "expired 12 hours ago" into
    # "0 days left".
    if (( seconds_left < 0 )); then
        echo "❌ $domain - 证书已过期 $(( -seconds_left / 86400 )) 天! ($expire_date)"
        continue
    fi

    days_left=$(( seconds_left / 86400 ))
    if (( days_left < DAYS_WARNING )); then
        echo "⚠️  $domain - 证书将在 $days_left 天后过期 ($expire_date)"
    else
        echo "✅ $domain - 证书有效,剩余 $days_left 天 ($expire_date)"
    fi
done

五、性能分析与优化

5.1 系统性能快照

#!/bin/bash
# File: /opt/scripts/performance_snapshot.sh
# Purpose: collect a one-off snapshot of system, CPU, memory, disk, network
#          and process information in a timestamped text file.

SNAPSHOT_DIR="/var/log/performance_snapshots"
mkdir -p "$SNAPSHOT_DIR"

SNAPSHOT_FILE="$SNAPSHOT_DIR/snapshot_$(date '+%Y%m%d_%H%M%S').txt"

# Everything inside the group goes to the snapshot file through a single
# redirection, so no command's output can miss the file by accident.
{
    echo "性能快照 - $(date)"
    echo "================================"

    # 1. Basic system information
    echo -e "\n1. 系统基本信息:"
    echo "主机名: $(hostname)"
    echo "内核版本: $(uname -r)"
    echo "运行时间: $(uptime -p)"

    # 2. CPU
    echo -e "\n2. CPU信息:"
    echo "CPU型号: $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
    # Anchored so only the per-CPU "processor : N" lines are counted
    echo "CPU核心数: $(grep -c '^processor' /proc/cpuinfo)"
    echo "当前负载: $(uptime | awk -F'load average:' '{print $2}')"
    echo "CPU使用率:"
    top -bn1 | grep "Cpu(s)"

    # 3. Memory
    echo -e "\n3. 内存信息:"
    free -h
    echo -e "\n内存占用前10进程:"
    ps aux --sort=-%mem | head -11

    # 4. Disk
    echo -e "\n4. 磁盘信息:"
    df -h
    echo -e "\n磁盘IO统计:"
    # Test for the tool explicitly: with `iostat ... || echo ... >> file`
    # the redirect applies to the echo only and iostat's output is lost.
    if command -v iostat &> /dev/null; then
        iostat -dx 1 2 2>/dev/null
    else
        echo "iostat未安装"
    fi

    # 5. Network
    echo -e "\n5. 网络连接:"
    if command -v ss &> /dev/null; then
        ss -s
    else
        netstat -s | head -20
    fi

    # 6. Process count
    echo -e "\n6. 进程数统计:"
    ps aux --no-headers | wc -l
} > "$SNAPSHOT_FILE"

echo "快照已保存至: $SNAPSHOT_FILE"

六、定时任务配置示例

# Add the following entries with `crontab -e`.

# Check system status every 5 minutes (output discarded; the script keeps its own log)
*/5 * * * * /opt/scripts/system_monitor.sh > /dev/null 2>&1

# Run the file backup every day at 02:00
0 2 * * * /opt/scripts/smart_backup.sh

# Run the MySQL full backup every Sunday at 03:00
0 3 * * 0 /opt/scripts/mysql_backup.sh

# Every day at 04:00, delete *.log files under /var/log older than 7 days
# NOTE(review): the pattern matches every *.log below /var/log, not only the
# logs written by these scripts, and it is shorter than the 30-day retention
# smart_backup.sh applies to its own backup_*.log files — confirm the scope.
0 4 * * * find /var/log -name "*.log" -mtime +7 -delete

# Check certificates every hour and mail the report
0 * * * * /opt/scripts/cert_check.sh | mail -s "证书检查报告" admin@example.com

# Run the security audit at 00:00 on the 1st of every month
0 0 1 * * /opt/scripts/security_audit.sh

使用建议

1. 脚本部署步骤

# 1. Create the script directory
sudo mkdir -p /opt/scripts

# 2. Copy the scripts from the current directory and make them executable
sudo cp *.sh /opt/scripts/
sudo chmod +x /opt/scripts/*.sh

# 3. Create a log directory
# NOTE(review): the scripts in this article write directly under /var/log,
# not /var/log/scripts — confirm whether this directory is actually used.
sudo mkdir -p /var/log/scripts

2. 调试脚本

# Syntax check only (the script is parsed but not executed)
bash -n script.sh

# Trace mode: print each command as it is executed
bash -x script.sh

# Run the script and keep a copy of stdout and stderr in a log file
./script.sh 2>&1 | tee /var/log/script_exec.log

3. 安全注意事项

  1. 脚本中避免硬编码密码,使用配置文件或环境变量
  2. 关键脚本设置只读权限:chmod 400 sensitive_script.sh
  3. 定期审计脚本内容
  4. 重要操作前添加确认提示

这些脚本经过生产环境验证,可以根据实际需求进行调整。建议先在小范围测试环境验证后再部署到生产环境。

总结

以上为个人经验,希望能给大家一个参考,也希望大家多多支持脚本之家。

您可能感兴趣的文章:
阅读全文