Files
photography/scripts/monitoring-setup.sh
xujiang c8b9049a9b feat: 完成部署和运维系统完善
- 完善后端CI/CD部署流程,支持systemd服务管理
- 配置Caddy多域名反向代理 (前端/API/管理后台)
- 创建完整的生产环境监控系统
- 添加自动化运维脚本和定时监控
- 优化安全配置和错误处理机制
- 标准化备份、回滚、健康检查流程

🎯 里程碑: 部署和运维体系完善,生产环境就绪
📊 进度: 65.0% (26/40任务完成)
2025-07-11 14:19:13 +08:00

733 lines
20 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Monitoring setup script for the photography portfolio project.
# Configures log collection, performance monitoring, error reporting and
# health checks.

# Abort on the first command that fails.
set -o errexit

# ANSI colour escape sequences interpolated by the print_* helpers.
NC='\033[0m' # reset / no colour
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
# Output helpers.

# Print an informational message on stdout, prefixed with a green [INFO] tag.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
# Print a warning message on stdout, prefixed with a yellow [WARN] tag.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
# Print an error message on stdout, prefixed with a red [ERROR] tag.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Privilege check.
# Returns normally when the effective user is root; otherwise prints an
# error and terminates the script with exit status 1.
check_permissions() {
  (( EUID == 0 )) && return 0

  print_error "此脚本需要 root 权限运行"
  exit 1
}
# Dependency check.
#
# Detects the system package manager (apt-get or yum) and installs any
# required command-line tool that is not yet available.
# Globals:  PACKAGE_MANAGER (written) - "apt-get" or "yum"
# Exits:    status 1 when neither package manager is present
check_dependencies() {
  print_info "检查系统依赖..."

  # Detect the system package manager
  if command -v apt-get &> /dev/null; then
    PACKAGE_MANAGER="apt-get"
  elif command -v yum &> /dev/null; then
    PACKAGE_MANAGER="yum"
  else
    print_error "不支持的包管理器"
    exit 1
  fi

  # Commands the installer and the generated monitoring scripts rely on.
  # netstat is called by the generated health-check and dashboard scripts.
  local tools=("curl" "wget" "systemctl" "journalctl" "logrotate" "netstat")
  local tool package
  for tool in "${tools[@]}"; do
    if command -v "$tool" &> /dev/null; then
      continue
    fi

    # A command name is not always a package name: systemctl and journalctl
    # ship in "systemd", netstat ships in "net-tools". Asking the package
    # manager for a package called "systemctl" fails and, under `set -e`,
    # would abort the whole installation.
    case "$tool" in
      systemctl | journalctl) package="systemd" ;;
      netstat) package="net-tools" ;;
      *) package="$tool" ;;
    esac

    print_warn "$tool 未安装,正在安装..."
    "$PACKAGE_MANAGER" install -y "$package"
  done
}
# Create the monitoring directory layout.
#
# Creates the log, configuration and script directories, hands the three
# top-level trees to gitea:gitea recursively, and sets mode 755 on the three
# top-level directories themselves.
create_directories() {
  print_info "创建监控目录结构..."

  local -a roots=(
    /var/log/photography
    /etc/photography
    /opt/photography
  )
  local -a leaves=(
    /var/log/photography/frontend
    /var/log/photography/backend
    /var/log/photography/admin
    /var/log/photography/monitoring
    /etc/photography/monitoring
    /opt/photography/monitoring/scripts
    /opt/photography/monitoring/config
  )
  local root

  # Directory creation
  mkdir -p "${leaves[@]}"

  # Ownership, then permissions
  for root in "${roots[@]}"; do
    chown -R gitea:gitea "$root"
  done
  for root in "${roots[@]}"; do
    chmod 755 "$root"
  done
}
# Configure log collection.
#
# Writes an rsyslog rule file that routes Caddy and backend messages into
# /var/log/photography/..., restarts rsyslog so the rules take effect, and
# installs a logrotate policy for the project logs and the Caddy logs.
# Outputs: /etc/rsyslog.d/50-photography.conf, /etc/logrotate.d/photography
setup_logging() {
  print_info "配置日志收集系统..."

  # Rule order matters. Every rule ends with "& stop", and both
  # 'admin.photography.iriver.top' and 'api.photography.iriver.top' contain
  # the substring 'photography.iriver.top'. The sub-domain rules must
  # therefore precede the bare-domain (frontend) rule; otherwise the frontend
  # rule captures all admin and API messages and the later rules never fire.
  cat > /etc/rsyslog.d/50-photography.conf << 'EOF'
# Photography Portfolio 日志配置
# NOTE: keep the sub-domain rules above the bare-domain rule (substring match + stop).
# 管理后台日志
if $programname == 'caddy' and $msg contains 'admin.photography.iriver.top' then /var/log/photography/admin/access.log
& stop
# API 访问日志
if $programname == 'caddy' and $msg contains 'api.photography.iriver.top' then /var/log/photography/backend/api.log
& stop
# 前端访问日志
if $programname == 'caddy' and $msg contains 'photography.iriver.top' then /var/log/photography/frontend/access.log
& stop
# 后端应用日志
if $programname == 'photography-backend' then /var/log/photography/backend/application.log
& stop
EOF

  # Restart rsyslog so the new rules are loaded
  systemctl restart rsyslog

  # logrotate policy: daily rotation, 30 generations, compressed one cycle late
  cat > /etc/logrotate.d/photography << 'EOF'
/var/log/photography/*/*.log {
    daily
    rotate 30
    compress
    delaycompress
    missingok
    notifempty
    sharedscripts
    postrotate
        /usr/bin/systemctl reload rsyslog > /dev/null 2>&1 || true
    endscript
}
/var/log/caddy/*.log {
    daily
    rotate 30
    compress
    delaycompress
    missingok
    notifempty
    sharedscripts
    postrotate
        /usr/bin/systemctl reload caddy > /dev/null 2>&1 || true
    endscript
}
EOF
}
# Configure performance monitoring.
#
# Generates /opt/photography/monitoring/scripts/performance-monitor.sh, makes
# it executable and hands it to gitea:gitea. The generated script appends
# system metrics, service states, HTTP probe timings and disk/memory
# threshold messages to /var/log/photography/monitoring/performance.log.
#
# Changes in the generated script compared with the previous version:
#   * each endpoint is requested once; status code and timing come from the
#     same request instead of two separate ones
#   * every probe has a hard timeout so a hung endpoint cannot stall the job
#   * the backend probe now requires HTTP 200 like the other probes (before,
#     any completed HTTP exchange, even a 500, was logged as healthy)
#   * expansions are quoted and `local` is separated from command substitution
# Log line formats are unchanged.
setup_performance_monitoring() {
  print_info "配置性能监控系统..."

  # Quoted delimiter: the script body below is written out literally.
  cat > /opt/photography/monitoring/scripts/performance-monitor.sh << 'EOF'
#!/bin/bash
# 摄影作品集性能监控脚本
LOG_FILE="/var/log/photography/monitoring/performance.log"
API_URL="http://localhost:8080"
FRONTEND_URL="https://photography.iriver.top"
ADMIN_URL="https://admin.photography.iriver.top"
API_PROXY_URL="https://api.photography.iriver.top"
# Hard limit in seconds for a single HTTP probe
PROBE_TIMEOUT=10

# 创建日志文件
touch "$LOG_FILE"

# Current time in the format used by every log line
now() {
  date '+%Y-%m-%d %H:%M:%S'
}

# 获取系统指标
get_system_metrics() {
  local timestamp cpu_usage memory_usage disk_usage load_avg
  timestamp=$(now)
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
  memory_usage=$(free | grep Mem | awk '{printf "%.2f", $3/$2 * 100.0}')
  disk_usage=$(df -h / | awk 'NR==2{print $5}' | sed 's/%//')
  load_avg=$(uptime | awk -F'load average:' '{print $2}')
  echo "$timestamp [SYSTEM] CPU: ${cpu_usage}%, Memory: ${memory_usage}%, Disk: ${disk_usage}%, Load:${load_avg}" >> "$LOG_FILE"
}

# 检查服务状态
check_services() {
  local timestamp entry unit label state
  timestamp=$(now)
  # Each entry is "<systemd unit>:<label written to the log>"
  for entry in "caddy:Caddy" "photography-backend:Backend" "postgresql:PostgreSQL" "redis:Redis"; do
    unit=${entry%%:*}
    label=${entry#*:}
    if systemctl is-active --quiet "$unit"; then
      state="UP"
    else
      state="DOWN"
    fi
    echo "$timestamp [SERVICE] $label: $state" >> "$LOG_FILE"
  done
}

# Probe one URL and log either its response time or FAILED.
# Arguments: $1 timestamp, $2 log tag, $3 label, $4 URL
probe_endpoint() {
  local timestamp=$1 tag=$2 label=$3 url=$4
  local result http_code elapsed
  # One request yields both values, e.g. "200 0.012345"
  if result=$(curl -s -o /dev/null --max-time "$PROBE_TIMEOUT" \
    -w '%{http_code} %{time_total}' "$url" 2>/dev/null); then
    http_code=${result%% *}
    elapsed=${result##* }
  else
    http_code="000"
  fi
  if [ "$http_code" = "200" ]; then
    echo "$timestamp [$tag] $label: ${elapsed}s" >> "$LOG_FILE"
  else
    echo "$timestamp [$tag] $label: FAILED" >> "$LOG_FILE"
  fi
}

# 检查 API 响应时间
check_api_response() {
  local timestamp
  timestamp=$(now)
  probe_endpoint "$timestamp" "API" "Backend Health" "$API_URL/health"
  probe_endpoint "$timestamp" "WEB" "Frontend" "$FRONTEND_URL"
  probe_endpoint "$timestamp" "WEB" "Admin" "$ADMIN_URL"
  probe_endpoint "$timestamp" "PROXY" "API" "$API_PROXY_URL/health"
}

# 检查磁盘空间
check_disk_space() {
  local timestamp disk_usage
  timestamp=$(now)
  disk_usage=$(df -h / | awk 'NR==2{print $5}' | sed 's/%//')
  if [ "$disk_usage" -gt 90 ]; then
    echo "$timestamp [ALERT] Disk usage critical: ${disk_usage}%" >> "$LOG_FILE"
  elif [ "$disk_usage" -gt 80 ]; then
    echo "$timestamp [WARN] Disk usage high: ${disk_usage}%" >> "$LOG_FILE"
  else
    echo "$timestamp [INFO] Disk usage normal: ${disk_usage}%" >> "$LOG_FILE"
  fi
}

# 检查内存使用
check_memory_usage() {
  local timestamp memory_usage
  timestamp=$(now)
  memory_usage=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}')
  if [ "$memory_usage" -gt 90 ]; then
    echo "$timestamp [ALERT] Memory usage critical: ${memory_usage}%" >> "$LOG_FILE"
  elif [ "$memory_usage" -gt 80 ]; then
    echo "$timestamp [WARN] Memory usage high: ${memory_usage}%" >> "$LOG_FILE"
  else
    echo "$timestamp [INFO] Memory usage normal: ${memory_usage}%" >> "$LOG_FILE"
  fi
}

# 主函数
main() {
  get_system_metrics
  check_services
  check_api_response
  check_disk_space
  check_memory_usage
}

# 执行监控
main
EOF

  # Make the script executable and owned by the account cron runs it as
  chmod +x /opt/photography/monitoring/scripts/performance-monitor.sh
  chown gitea:gitea /opt/photography/monitoring/scripts/performance-monitor.sh
}
# Configure error reporting.
#
# Generates /opt/photography/monitoring/scripts/error-monitor.sh, makes it
# executable and hands it to gitea:gitea. The generated script scans the
# backend log, the systemd journal and the Caddy access logs and appends a
# summary to /var/log/photography/monitoring/errors.log.
#
# Defects fixed in the generated script:
#   * `grep -c ... | tail -100 | wc -l` always produced 1 (grep -c prints a
#     single line), so an error was reported on every run; the last 100 log
#     lines are now filtered and counted instead
#   * `grep -c ... || echo "0"` produced "0<newline>0" when nothing matched
#     (grep -c already prints 0 and exits 1), which broke the arithmetic
#   * `journalctl ... | wc -l` counted informational banner lines such as
#     "-- No entries --"; journalctl now runs with --quiet
#   * counters are local, and the unused /tmp/photography-error-count file is
#     no longer created
setup_error_reporting() {
  print_info "配置错误报告系统..."

  # Quoted delimiter: the script body below is written out literally.
  cat > /opt/photography/monitoring/scripts/error-monitor.sh << 'EOF'
#!/bin/bash
# 摄影作品集错误监控脚本
LOG_FILE="/var/log/photography/monitoring/errors.log"

# 创建日志文件
touch "$LOG_FILE"

# 检查后端错误
check_backend_errors() {
  local timestamp app_log file_errors journal_errors error_count
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  app_log="/var/log/photography/backend/application.log"
  file_errors=0

  # 检查后端应用日志中的错误
  if [ -f "$app_log" ]; then
    # grep -c prints the count itself, including 0; no fallback echo is needed
    file_errors=$(tail -n 100 "$app_log" | grep -c "ERROR\|FATAL")
  fi

  # 检查系统日志中的后端错误
  journal_errors=$(journalctl -q -u photography-backend --since "5 minutes ago" --no-pager \
    | grep -c "ERROR\|FATAL")

  error_count=$(( ${file_errors:-0} + ${journal_errors:-0} ))
  if [ "$error_count" -gt 0 ]; then
    echo "$timestamp [BACKEND] Found $error_count errors in the last 5 minutes" >> "$LOG_FILE"
    # 记录具体错误
    journalctl -q -u photography-backend --since "5 minutes ago" --no-pager \
      | grep "ERROR\|FATAL" | tail -n 5 >> "$LOG_FILE"
  fi
}

# 检查前端错误
check_frontend_errors() {
  local timestamp error_count
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # 检查 Caddy 日志中的 4xx 和 5xx 错误
  if [ -f "/var/log/caddy/photography.log" ]; then
    error_count=$(grep -cE '"status":[4-5][0-9][0-9]' /var/log/caddy/photography.log)
    if [ "${error_count:-0}" -gt 10 ]; then
      echo "$timestamp [FRONTEND] Found $error_count HTTP errors in access logs" >> "$LOG_FILE"
    fi
  fi
}

# 检查 API 错误
check_api_errors() {
  local timestamp error_count
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # 检查 API 访问日志中的错误
  if [ -f "/var/log/caddy/api.photography.log" ]; then
    error_count=$(grep -cE '"status":[4-5][0-9][0-9]' /var/log/caddy/api.photography.log)
    if [ "${error_count:-0}" -gt 5 ]; then
      echo "$timestamp [API] Found $error_count API errors in access logs" >> "$LOG_FILE"
    fi
  fi
}

# 检查系统错误
check_system_errors() {
  local timestamp system_errors
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # 检查系统日志中的严重错误
  # -q keeps informational banner lines out of the count
  system_errors=$(journalctl -q --since "5 minutes ago" --priority=err --no-pager | wc -l)
  if [ "$system_errors" -gt 0 ]; then
    echo "$timestamp [SYSTEM] Found $system_errors system errors in the last 5 minutes" >> "$LOG_FILE"
    # 记录具体错误
    journalctl -q --since "5 minutes ago" --priority=err --no-pager | tail -n 3 >> "$LOG_FILE"
  fi
}

# 主函数
main() {
  check_backend_errors
  check_frontend_errors
  check_api_errors
  check_system_errors
}

# 执行错误监控
main
EOF

  # Make the script executable and owned by the account cron runs it as
  chmod +x /opt/photography/monitoring/scripts/error-monitor.sh
  chown gitea:gitea /opt/photography/monitoring/scripts/error-monitor.sh
}
# Configure health checks.
#
# Generates /opt/photography/monitoring/scripts/health-check.sh, makes it
# executable and hands it to gitea:gitea. The generated script checks systemd
# units, listening ports and HTTP endpoints, logs one OK/FAILED line per
# check to /var/log/photography/monitoring/health.log, and maintains
# /tmp/photography-health-alert with the names of the failed checks.
#
# Defects fixed in the generated script:
#   * `grep -q :80` also matched :8080 (and :443 matched e.g. :4430), so a
#     port was reported as listening when only a longer port number was; the
#     pattern is now anchored with a trailing space
#   * netstat no longer uses -p, which only adds a warning when not run as root
#   * curl probes are capped at 10 seconds so the every-minute job cannot
#     pile up behind a hung endpoint
#   * the command passed to eval is quoted
setup_health_checks() {
  print_info "配置健康检查系统..."

  # Quoted delimiter: the script body below is written out literally.
  cat > /opt/photography/monitoring/scripts/health-check.sh << 'EOF'
#!/bin/bash
# 摄影作品集健康检查脚本
LOG_FILE="/var/log/photography/monitoring/health.log"
ALERT_FILE="/tmp/photography-health-alert"

# 创建日志文件
touch "$LOG_FILE"

# 健康检查函数
# Arguments: $1 name written to the log, $2 command line evaluated as the check
check_service_health() {
  local service_name=$1
  local check_command=$2
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # The command strings are fixed literals defined in main below
  if eval "$check_command"; then
    echo "$timestamp [HEALTH] $service_name: OK" >> "$LOG_FILE"
    return 0
  else
    echo "$timestamp [HEALTH] $service_name: FAILED" >> "$LOG_FILE"
    return 1
  fi
}

# Succeeds when a TCP socket is listening on exactly port $1.
# The trailing space in the pattern keeps ":80" from matching ":8080".
port_is_listening() {
  netstat -tln 2>/dev/null | grep -q ":$1 "
}

# 检查各个服务
main() {
  local failed_services=()
  # Each entry is "<name>|<check command>"
  local checks=(
    "Caddy|systemctl is-active --quiet caddy"
    "Backend|systemctl is-active --quiet photography-backend"
    "PostgreSQL|systemctl is-active --quiet postgresql"
    "Redis|systemctl is-active --quiet redis"
    "Port 80|port_is_listening 80"
    "Port 443|port_is_listening 443"
    "Port 8080|port_is_listening 8080"
    "API Health|curl -f -s -o /dev/null --max-time 10 http://localhost:8080/health"
    "Frontend|curl -f -s -o /dev/null --max-time 10 https://photography.iriver.top"
    "Admin|curl -f -s -o /dev/null --max-time 10 https://admin.photography.iriver.top"
  )
  local check name

  for check in "${checks[@]}"; do
    name=${check%%|*}
    if ! check_service_health "$name" "${check#*|}"; then
      failed_services+=("$name")
    fi
  done

  # 如果有失败的服务,记录警报
  if [ "${#failed_services[@]}" -gt 0 ]; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') [ALERT] Failed services: ${failed_services[*]}" >> "$LOG_FILE"
    echo "${failed_services[*]}" > "$ALERT_FILE"
  else
    rm -f "$ALERT_FILE"
  fi
}

# 执行健康检查
main
EOF

  # Make the script executable and owned by the account cron runs it as
  chmod +x /opt/photography/monitoring/scripts/health-check.sh
  chown gitea:gitea /opt/photography/monitoring/scripts/health-check.sh
}
# Configure cron jobs.
#
# Installs /etc/cron.d/photography-monitoring, which runs the health check
# every minute, the performance and error monitors every five minutes, and an
# hourly cleanup of monitoring logs older than seven days, all as user gitea.
# The cron daemon is then restarted.
setup_cron_jobs() {
  print_info "配置定时任务..."

  # Quoted delimiter: the crontab below is written out literally.
  cat > /etc/cron.d/photography-monitoring << 'EOF'
# Photography Portfolio 监控定时任务
# 每分钟执行健康检查
* * * * * gitea /opt/photography/monitoring/scripts/health-check.sh
# 每5分钟执行性能监控
*/5 * * * * gitea /opt/photography/monitoring/scripts/performance-monitor.sh
# 每5分钟执行错误监控
*/5 * * * * gitea /opt/photography/monitoring/scripts/error-monitor.sh
# 每小时执行一次清理删除超过7天的监控日志
0 * * * * gitea find /var/log/photography/monitoring/ -name "*.log" -mtime +7 -delete
EOF

  # The unit is called "cron" on Debian/Ubuntu and "crond" on RHEL-family
  # systems. The installer accepts both apt-get and yum hosts, so pick
  # whichever unit exists instead of aborting on a missing "cron" unit.
  local cron_unit="cron"
  if ! systemctl cat cron.service > /dev/null 2>&1; then
    cron_unit="crond"
  fi
  systemctl restart "$cron_unit"
}
# Create the monitoring dashboard.
#
# Generates /opt/photography/monitoring/scripts/dashboard.sh, makes it
# executable and hands it to gitea:gitea. The generated script prints a
# one-shot text summary: system info, service states, listening ports, disk
# and memory usage, and the tails of the error and health logs.
#
# Defect fixed in the generated script: the memory percentage was computed
# from `free -h` output, i.e. from values such as "512Mi" and "7.7Gi". awk
# keeps only the leading number, so the result was wrong whenever used and
# total carried different unit suffixes. The percentage now comes from the
# plain `free` figures; the human-readable values are still used for display.
create_dashboard() {
  print_info "创建监控仪表板..."

  # Quoted delimiter: the script body below is written out literally.
  cat > /opt/photography/monitoring/scripts/dashboard.sh << 'EOF'
#!/bin/bash
# 摄影作品集监控仪表板
echo "=========================="
echo "摄影作品集系统监控仪表板"
echo "=========================="
echo ""

# 系统信息
echo "📊 系统信息:"
echo " 时间: $(date)"
echo " 运行时间: $(uptime -p)"
echo " 负载: $(uptime | awk -F'load average:' '{print $2}')"
echo ""

# 服务状态
echo "🔧 服务状态:"
services=("caddy" "photography-backend" "postgresql" "redis")
for service in "${services[@]}"; do
  if systemctl is-active --quiet "$service"; then
    echo " $service: ✅ 运行中"
  else
    echo " $service: ❌ 停止"
  fi
done
echo ""

# 端口监听
echo "🌐 端口监听:"
ports=("80" "443" "8080" "5432" "6379")
for port in "${ports[@]}"; do
  # -p is omitted: program names are not needed and it warns for non-root users
  if netstat -tln 2>/dev/null | grep -q ":$port "; then
    echo " 端口 $port: ✅ 监听中"
  else
    echo " 端口 $port: ❌ 未监听"
  fi
done
echo ""

# 磁盘使用
echo "💾 磁盘使用:"
df -h | grep -E '^/dev/' | awk '{print " " $1 ": " $5 " 已使用 (" $3 "/" $2 ")"}'
echo ""

# 内存使用
echo "🧠 内存使用:"
# Human-readable figures for display, same-unit figures for the percentage
mem_display=$(free -h | awk '/^Mem:/ {print $3 "/" $2}')
mem_percent=$(free | awk '/^Mem:/ {printf "%d", $3 / $2 * 100}')
echo " 内存: ${mem_display} (${mem_percent}% 已使用)"
echo ""

# 最近的错误
echo "⚠️ 最近的错误 (最近5条):"
if [ -f "/var/log/photography/monitoring/errors.log" ]; then
  tail -5 /var/log/photography/monitoring/errors.log | sed 's/^/ /'
else
  echo " 没有错误日志"
fi
echo ""

# 最近的健康检查
echo "💚 最近的健康检查 (最近5条):"
if [ -f "/var/log/photography/monitoring/health.log" ]; then
  tail -5 /var/log/photography/monitoring/health.log | sed 's/^/ /'
else
  echo " 没有健康检查日志"
fi
echo ""
echo "=========================="
echo "监控仪表板完成"
echo "=========================="
EOF

  # Make the script executable and owned by gitea
  chmod +x /opt/photography/monitoring/scripts/dashboard.sh
  chown gitea:gitea /opt/photography/monitoring/scripts/dashboard.sh
}
# Create the monitoring configuration file.
#
# Writes /etc/photography/monitoring/config.yaml, owned by gitea:gitea with
# mode 644.
#
# The YAML is emitted with explicit nesting. Written flat (every key at
# column 0) the document is not valid YAML: keys such as check_interval and
# enabled repeat at the same level, and list items are followed by column-0
# keys.
# NOTE(review): the nesting was reconstructed from the section comments;
# whether services/endpoints/alerts belong at the top level or under
# "monitoring" should be confirmed against whatever consumes this file.
create_monitoring_config() {
  print_info "创建监控配置文件..."

  # Quoted delimiter: the YAML below is written out literally.
  cat > /etc/photography/monitoring/config.yaml << 'EOF'
# Photography Portfolio 监控配置
monitoring:
  # 日志配置
  logging:
    level: "info"
    max_size: "10MB"
    max_age: 30
    max_backups: 5
    compress: true
  # 性能监控
  performance:
    check_interval: 300 # 5分钟
    cpu_threshold: 80
    memory_threshold: 80
    disk_threshold: 80
    response_time_threshold: 2.0
  # 错误监控
  error_monitoring:
    check_interval: 300 # 5分钟
    max_errors_per_check: 10
    alert_threshold: 5
  # 健康检查
  health_check:
    check_interval: 60 # 1分钟
    timeout: 10
    retry_count: 3
# 服务列表
services:
  - name: "caddy"
    type: "systemd"
    critical: true
  - name: "photography-backend"
    type: "systemd"
    critical: true
  - name: "postgresql"
    type: "systemd"
    critical: true
  - name: "redis"
    type: "systemd"
    critical: false
# 端点检查
endpoints:
  - name: "Frontend"
    url: "https://photography.iriver.top"
    method: "GET"
    expected_status: 200
    timeout: 10
  - name: "Admin"
    url: "https://admin.photography.iriver.top"
    method: "GET"
    expected_status: 200
    timeout: 10
  - name: "API Health"
    url: "https://api.photography.iriver.top/health"
    method: "GET"
    expected_status: 200
    timeout: 5
  - name: "Backend Health"
    url: "http://localhost:8080/health"
    method: "GET"
    expected_status: 200
    timeout: 5
# 警报配置
alerts:
  enabled: true
  channels:
    - type: "log"
      enabled: true
      level: "error"
    - type: "email"
      enabled: false
      smtp_server: ""
      smtp_port: 587
      username: ""
      password: ""
      to: ""
      from: ""
EOF

  # Ownership and permissions
  # NOTE(review): mode 644 is world-readable; tighten it before a real SMTP
  # password is stored in this file.
  chown gitea:gitea /etc/photography/monitoring/config.yaml
  chmod 644 /etc/photography/monitoring/config.yaml
}
# Entry point.
# Prints a banner, runs every setup step in order, then prints a summary of
# the installed scripts, the log locations and usage notes.
main() {
  local rule="==================================="
  printf '%s\n' "$rule" "摄影作品集监控系统配置脚本" "$rule" ""

  # Setup steps, executed strictly in this order.
  local step
  for step in \
    check_permissions \
    check_dependencies \
    create_directories \
    setup_logging \
    setup_performance_monitoring \
    setup_error_reporting \
    setup_health_checks \
    setup_cron_jobs \
    create_dashboard \
    create_monitoring_config; do
    "$step"
  done

  print_info "监控系统配置完成!"

  # Post-install summary, one argument per output line.
  printf '%s\n' \
    "" \
    "🔧 监控脚本位置:" \
    " - 性能监控: /opt/photography/monitoring/scripts/performance-monitor.sh" \
    " - 错误监控: /opt/photography/monitoring/scripts/error-monitor.sh" \
    " - 健康检查: /opt/photography/monitoring/scripts/health-check.sh" \
    " - 监控仪表板: /opt/photography/monitoring/scripts/dashboard.sh" \
    "" \
    "📊 监控日志位置:" \
    " - 性能日志: /var/log/photography/monitoring/performance.log" \
    " - 错误日志: /var/log/photography/monitoring/errors.log" \
    " - 健康日志: /var/log/photography/monitoring/health.log" \
    "" \
    "📋 使用方法:" \
    " - 查看监控仪表板: /opt/photography/monitoring/scripts/dashboard.sh" \
    " - 手动执行健康检查: /opt/photography/monitoring/scripts/health-check.sh" \
    " - 查看监控配置: /etc/photography/monitoring/config.yaml" \
    "" \
    "⚠️ 注意事项:" \
    " - 定时任务已配置,每分钟执行健康检查" \
    " - 每5分钟执行性能和错误监控" \
    " - 日志文件会自动轮转保留30天" \
    " - 可以根据需要修改配置文件" \
    ""

  print_info "监控系统安装完成!"
}
# Run the entry point, forwarding all command-line arguments unchanged.
main "$@"