feat: 完成部署和运维系统完善

- 完善后端CI/CD部署流程,支持systemd服务管理
- 配置Caddy多域名反向代理 (前端/API/管理后台)
- 创建完整的生产环境监控系统
- 添加自动化运维脚本和定时监控
- 优化安全配置和错误处理机制
- 标准化备份、回滚、健康检查流程

🎯 里程碑: 部署和运维体系完善,生产环境就绪
📊 进度: 65.0% (26/40任务完成)
This commit is contained in:
xujiang
2025-07-11 14:19:13 +08:00
parent 0ddde92a3c
commit c8b9049a9b
4 changed files with 944 additions and 17 deletions

733
scripts/monitoring-setup.sh Normal file
View File

@ -0,0 +1,733 @@
#!/bin/bash
# Monitoring bootstrap for the photography portfolio project.
# Sets up log collection, performance monitoring, error reporting and
# health checks.
set -o errexit

# ANSI colour sequences. They are stored as literal backslash escapes and are
# only turned into control characters when printed with escape processing.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
NC="\033[0m" # reset (no colour)
# Output helpers
# Print an informational message ($1) to stdout behind a green [INFO] tag.
# Backslash escapes in the whole line are interpreted.
print_info() {
  local line="${GREEN}[INFO]${NC} $1"
  printf '%b\n' "$line"
}
# Print a warning message ($1) to stdout behind a yellow [WARN] tag.
# Backslash escapes in the whole line are interpreted.
print_warn() {
  local line="${YELLOW}[WARN]${NC} $1"
  printf '%b\n' "$line"
}
# Print an error message ($1) behind a red [ERROR] tag.
# The line goes to stderr so that errors stay visible when stdout is
# redirected or captured. Backslash escapes in the line are interpreted.
print_error() {
  echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Privilege check
# Return 0 when the effective user is root (EUID 0); otherwise report the
# problem through print_error and terminate the script with status 1.
check_permissions() {
  (( EUID == 0 )) && return 0
  print_error "此脚本需要 root 权限运行"
  exit 1
}
# Dependency check
# Select the system package manager and install any missing command-line tool
# that this installer, or the monitoring scripts it generates, calls.
#
# Globals:
#   PACKAGE_MANAGER (written) - "apt-get" or "yum"
# Exits:
#   1 when neither package manager is available or a package install fails
check_dependencies() {
  print_info "检查系统依赖..."

  # Pick the system package manager.
  if command -v apt-get &> /dev/null; then
    PACKAGE_MANAGER="apt-get"
  elif command -v yum &> /dev/null; then
    PACKAGE_MANAGER="yum"
  else
    print_error "不支持的包管理器"
    exit 1
  fi

  # "command:package" pairs. A command is not always shipped by a package of
  # the same name: systemctl and journalctl both come from systemd, and
  # netstat (used by the generated health-check and dashboard scripts) comes
  # from net-tools.
  local -a requirements=(
    "curl:curl"
    "wget:wget"
    "systemctl:systemd"
    "journalctl:systemd"
    "logrotate:logrotate"
    "netstat:net-tools"
  )
  local entry tool package
  local attempted=" " # space-delimited list of packages already installed here

  for entry in "${requirements[@]}"; do
    tool=${entry%%:*}
    package=${entry#*:}

    if command -v "$tool" &> /dev/null; then
      continue
    fi
    print_warn "$tool 未安装,正在安装..."

    # One package can provide several tools; install it only once per run.
    case "$attempted" in
      *" $package "*) continue ;;
    esac
    attempted+="$package "

    if ! "$PACKAGE_MANAGER" install -y "$package"; then
      print_error "安装 $package 失败"
      exit 1
    fi
  done
}
# Directory layout
# Create the log, configuration and tooling directory trees for monitoring,
# hand all three roots (recursively) to gitea:gitea and set the roots to 755.
create_directories() {
  print_info "创建监控目录结构..."

  local -a roots=(/var/log/photography /etc/photography /opt/photography)
  local root leaf

  # Leaf directories; intermediate parents are created by -p.
  for leaf in frontend backend admin monitoring; do
    mkdir -p "/var/log/photography/${leaf}"
  done
  mkdir -p /etc/photography/monitoring
  for leaf in scripts config; do
    mkdir -p "/opt/photography/monitoring/${leaf}"
  done

  # Ownership first, then permissions, in the same root order.
  for root in "${roots[@]}"; do
    chown -R gitea:gitea "$root"
  done
  for root in "${roots[@]}"; do
    chmod 755 "$root"
  done
}
# Log collection
# Install the rsyslog routing rules and the logrotate policy for project logs.
#
# Writes /etc/rsyslog.d/50-photography.conf, restarts rsyslog, then writes
# /etc/logrotate.d/photography (daily rotation, 30 generations, compressed).
setup_logging() {
  print_info "配置日志收集系统..."

  # Rule order matters: every rule ends in "stop", and the bare domain
  # 'photography.iriver.top' is a substring of both the admin and the api host
  # names. The two sub-domain rules therefore come first; with the bare-domain
  # rule on top it would capture all Caddy messages and the admin/API rules
  # would never match.
  cat > /etc/rsyslog.d/50-photography.conf << 'EOF'
# Photography Portfolio 日志配置
# 管理后台日志
if $programname == 'caddy' and $msg contains 'admin.photography.iriver.top' then /var/log/photography/admin/access.log
& stop
# API 访问日志
if $programname == 'caddy' and $msg contains 'api.photography.iriver.top' then /var/log/photography/backend/api.log
& stop
# 前端访问日志
if $programname == 'caddy' and $msg contains 'photography.iriver.top' then /var/log/photography/frontend/access.log
& stop
# 后端应用日志
if $programname == 'photography-backend' then /var/log/photography/backend/application.log
& stop
EOF

  # Load the new rules.
  systemctl restart rsyslog

  # Rotation policy for the project logs and the Caddy logs.
  cat > /etc/logrotate.d/photography << 'EOF'
/var/log/photography/*/*.log {
daily
rotate 30
compress
delaycompress
missingok
notifempty
sharedscripts
postrotate
/usr/bin/systemctl reload rsyslog > /dev/null 2>&1 || true
endscript
}
/var/log/caddy/*.log {
daily
rotate 30
compress
delaycompress
missingok
notifempty
sharedscripts
postrotate
/usr/bin/systemctl reload caddy > /dev/null 2>&1 || true
endscript
}
EOF
}
# Performance monitoring
# Generate /opt/photography/monitoring/scripts/performance-monitor.sh, make it
# executable and hand it to gitea:gitea.
#
# The generated script appends system metrics, systemd service state, HTTP
# probe timings and disk/memory threshold messages to
# /var/log/photography/monitoring/performance.log.
setup_performance_monitoring() {
  print_info "配置性能监控系统..."

  # Behaviour of the generated HTTP probes:
  # - each URL is fetched once; status code and timing come from that single
  #   request instead of two separate requests;
  # - the backend health probe requires HTTP 200 like the other probes, so an
  #   error response is logged as FAILED rather than as a response time;
  # - every request is bounded by --max-time so a hung endpoint cannot stall
  #   the job.
  cat > /opt/photography/monitoring/scripts/performance-monitor.sh << 'EOF'
#!/bin/bash
# 摄影作品集性能监控脚本
LOG_FILE="/var/log/photography/monitoring/performance.log"
API_URL="http://localhost:8080"
FRONTEND_URL="https://photography.iriver.top"
ADMIN_URL="https://admin.photography.iriver.top"
API_PROXY_URL="https://api.photography.iriver.top"
# Time limit in seconds for a single HTTP probe
PROBE_TIMEOUT=10

# 创建日志文件
touch "$LOG_FILE"

# 获取系统指标
get_system_metrics() {
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
  local memory_usage=$(free | grep Mem | awk '{printf "%.2f", $3/$2 * 100.0}')
  local disk_usage=$(df -h / | awk 'NR==2{print $5}' | sed 's/%//')
  local load_avg=$(uptime | awk -F'load average:' '{print $2}')
  echo "$timestamp [SYSTEM] CPU: ${cpu_usage}%, Memory: ${memory_usage}%, Disk: ${disk_usage}%, Load:${load_avg}" >> "$LOG_FILE"
}

# 检查服务状态
check_services() {
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local entry label unit
  # "label:unit" pairs, logged in this order
  for entry in "Caddy:caddy" "Backend:photography-backend" "PostgreSQL:postgresql" "Redis:redis"; do
    label=${entry%%:*}
    unit=${entry#*:}
    if systemctl is-active --quiet "$unit"; then
      echo "$timestamp [SERVICE] $label: UP" >> "$LOG_FILE"
    else
      echo "$timestamp [SERVICE] $label: DOWN" >> "$LOG_FILE"
    fi
  done
}

# Fetch one URL once and log its total time, or FAILED unless it answers 200.
# $1 - log tag, $2 - label, $3 - URL
probe_endpoint() {
  local tag=$1
  local label=$2
  local url=$3
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local result http_code response_time
  # curl prints "000" as http_code when no response was received.
  result=$(curl -s -o /dev/null --max-time "$PROBE_TIMEOUT" \
    -w '%{http_code} %{time_total}' "$url" 2>/dev/null)
  http_code=${result%% *}
  response_time=${result##* }
  if [ "$http_code" = "200" ]; then
    echo "$timestamp [$tag] $label: ${response_time}s" >> "$LOG_FILE"
  else
    echo "$timestamp [$tag] $label: FAILED" >> "$LOG_FILE"
  fi
}

# 检查 API 响应时间
check_api_response() {
  # 检查后端 API
  probe_endpoint "API" "Backend Health" "$API_URL/health"
  # 检查前端
  probe_endpoint "WEB" "Frontend" "$FRONTEND_URL"
  # 检查管理后台
  probe_endpoint "WEB" "Admin" "$ADMIN_URL"
  # 检查 API 代理
  probe_endpoint "PROXY" "API" "$API_PROXY_URL/health"
}

# 检查磁盘空间
check_disk_space() {
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local disk_usage=$(df -h / | awk 'NR==2{print $5}' | sed 's/%//')
  if [ "$disk_usage" -gt 90 ]; then
    echo "$timestamp [ALERT] Disk usage critical: ${disk_usage}%" >> "$LOG_FILE"
  elif [ "$disk_usage" -gt 80 ]; then
    echo "$timestamp [WARN] Disk usage high: ${disk_usage}%" >> "$LOG_FILE"
  else
    echo "$timestamp [INFO] Disk usage normal: ${disk_usage}%" >> "$LOG_FILE"
  fi
}

# 检查内存使用
check_memory_usage() {
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local memory_usage=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}')
  if [ "$memory_usage" -gt 90 ]; then
    echo "$timestamp [ALERT] Memory usage critical: ${memory_usage}%" >> "$LOG_FILE"
  elif [ "$memory_usage" -gt 80 ]; then
    echo "$timestamp [WARN] Memory usage high: ${memory_usage}%" >> "$LOG_FILE"
  else
    echo "$timestamp [INFO] Memory usage normal: ${memory_usage}%" >> "$LOG_FILE"
  fi
}

# 主函数
main() {
  get_system_metrics
  check_services
  check_api_response
  check_disk_space
  check_memory_usage
}

# 执行监控
main
EOF

  # Make the script runnable by the monitoring user.
  chmod +x /opt/photography/monitoring/scripts/performance-monitor.sh
  chown gitea:gitea /opt/photography/monitoring/scripts/performance-monitor.sh
}
# Error reporting
# Generate /opt/photography/monitoring/scripts/error-monitor.sh, make it
# executable and hand it to gitea:gitea.
#
# The generated script scans the backend application log, the systemd journal
# and the Caddy access logs, and appends findings to
# /var/log/photography/monitoring/errors.log.
setup_error_reporting() {
  print_info "配置错误报告系统..."

  # Counting rules used by the generated script:
  # - application log: matches are counted in the last 100 lines
  #   ("grep -c | tail | wc -l" always produced 1, i.e. a permanent alert);
  # - journal counts rely on grep -c alone: it already prints 0 when nothing
  #   matches, and an extra "|| echo 0" produced "0<newline>0", which broke
  #   the arithmetic;
  # - system errors use journalctl -q so the "-- No entries --" banner is not
  #   counted as an error line;
  # - the unused, predictably named /tmp counter file is no longer created.
  cat > /opt/photography/monitoring/scripts/error-monitor.sh << 'EOF'
#!/bin/bash
# 摄影作品集错误监控脚本
LOG_FILE="/var/log/photography/monitoring/errors.log"

# 创建日志文件
touch "$LOG_FILE"

# 检查后端错误
check_backend_errors() {
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local error_count=0
  local backend_errors=0
  # 检查后端应用日志中的错误
  if [ -f "/var/log/photography/backend/application.log" ]; then
    error_count=$(tail -n 100 /var/log/photography/backend/application.log 2>/dev/null | grep -c "ERROR\|FATAL")
  fi
  # 检查系统日志中的后端错误
  backend_errors=$(journalctl -q -u photography-backend --since "5 minutes ago" --no-pager 2>/dev/null | grep -c "ERROR\|FATAL")
  error_count=$((error_count + backend_errors))
  if [ "$error_count" -gt 0 ]; then
    echo "$timestamp [BACKEND] Found $error_count errors in the last 5 minutes" >> "$LOG_FILE"
    # 记录具体错误
    journalctl -q -u photography-backend --since "5 minutes ago" --no-pager 2>/dev/null | grep "ERROR\|FATAL" | tail -5 >> "$LOG_FILE"
  fi
}

# 检查前端错误
check_frontend_errors() {
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # 检查 Caddy 日志中的 4xx 和 5xx 错误
  if [ -f "/var/log/caddy/photography.log" ]; then
    local error_count=$(grep -E '"status":[4-5][0-9][0-9]' /var/log/caddy/photography.log | wc -l)
    if [ "$error_count" -gt 10 ]; then
      echo "$timestamp [FRONTEND] Found $error_count HTTP errors in access logs" >> "$LOG_FILE"
    fi
  fi
}

# 检查 API 错误
check_api_errors() {
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # 检查 API 访问日志中的错误
  if [ -f "/var/log/caddy/api.photography.log" ]; then
    local error_count=$(grep -E '"status":[4-5][0-9][0-9]' /var/log/caddy/api.photography.log | wc -l)
    if [ "$error_count" -gt 5 ]; then
      echo "$timestamp [API] Found $error_count API errors in access logs" >> "$LOG_FILE"
    fi
  fi
}

# 检查系统错误
check_system_errors() {
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # 检查系统日志中的严重错误
  local system_errors=$(journalctl -q --since "5 minutes ago" --priority=err --no-pager 2>/dev/null | wc -l)
  if [ "$system_errors" -gt 0 ]; then
    echo "$timestamp [SYSTEM] Found $system_errors system errors in the last 5 minutes" >> "$LOG_FILE"
    # 记录具体错误
    journalctl -q --since "5 minutes ago" --priority=err --no-pager 2>/dev/null | tail -3 >> "$LOG_FILE"
  fi
}

# 主函数
main() {
  check_backend_errors
  check_frontend_errors
  check_api_errors
  check_system_errors
}

# 执行错误监控
main
EOF

  # Make the script runnable by the monitoring user.
  chmod +x /opt/photography/monitoring/scripts/error-monitor.sh
  chown gitea:gitea /opt/photography/monitoring/scripts/error-monitor.sh
}
# Health checks
# Generate /opt/photography/monitoring/scripts/health-check.sh, make it
# executable and hand it to gitea:gitea.
#
# The generated script runs a fixed list of checks (systemd units, listening
# ports, HTTP endpoints), logs OK/FAILED per check to
# /var/log/photography/monitoring/health.log, and writes the names of failed
# checks to /tmp/photography-health-alert (removed again when all pass).
setup_health_checks() {
  print_info "配置健康检查系统..."

  # Details of the generated checks:
  # - port patterns end in a space (':80 ') so that port 80 is not satisfied
  #   by a listener on 8080, nor 443 by 4430;
  # - netstat runs without -p and with stderr discarded; the process column is
  #   not needed for a listen check;
  # - curl checks are bounded by --max-time so the job cannot hang;
  # - the check command is passed to eval quoted so its inner quoting
  #   survives.
  cat > /opt/photography/monitoring/scripts/health-check.sh << 'EOF'
#!/bin/bash
# 摄影作品集健康检查脚本
LOG_FILE="/var/log/photography/monitoring/health.log"
ALERT_FILE="/tmp/photography-health-alert"
# Time limit in seconds for a single HTTP check
CHECK_TIMEOUT=10

# 创建日志文件
touch "$LOG_FILE"

# 健康检查函数
check_service_health() {
  local service_name=$1
  local check_command=$2
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  if eval "$check_command"; then
    echo "$timestamp [HEALTH] $service_name: OK" >> "$LOG_FILE"
    return 0
  else
    echo "$timestamp [HEALTH] $service_name: FAILED" >> "$LOG_FILE"
    return 1
  fi
}

# 检查各个服务
main() {
  local failed_services=()
  local entry name
  # "name=command" pairs, executed in this order
  local checks=(
    "Caddy=systemctl is-active --quiet caddy"
    "Backend=systemctl is-active --quiet photography-backend"
    "PostgreSQL=systemctl is-active --quiet postgresql"
    "Redis=systemctl is-active --quiet redis"
    "Port 80=netstat -tln 2>/dev/null | grep -q ':80 '"
    "Port 443=netstat -tln 2>/dev/null | grep -q ':443 '"
    "Port 8080=netstat -tln 2>/dev/null | grep -q ':8080 '"
    "API Health=curl -f -s -o /dev/null --max-time $CHECK_TIMEOUT http://localhost:8080/health"
    "Frontend=curl -f -s -o /dev/null --max-time $CHECK_TIMEOUT https://photography.iriver.top"
    "Admin=curl -f -s -o /dev/null --max-time $CHECK_TIMEOUT https://admin.photography.iriver.top"
  )

  for entry in "${checks[@]}"; do
    name=${entry%%=*}
    if ! check_service_health "$name" "${entry#*=}"; then
      failed_services+=("$name")
    fi
  done

  # 如果有失败的服务,记录警报
  if [ ${#failed_services[@]} -gt 0 ]; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') [ALERT] Failed services: ${failed_services[*]}" >> "$LOG_FILE"
    echo "${failed_services[*]}" > "$ALERT_FILE"
  else
    rm -f "$ALERT_FILE"
  fi
}

# 执行健康检查
main
EOF

  # Make the script runnable by the monitoring user.
  chmod +x /opt/photography/monitoring/scripts/health-check.sh
  chown gitea:gitea /opt/photography/monitoring/scripts/health-check.sh
}
# Cron jobs
# Install /etc/cron.d/photography-monitoring, which runs the monitoring
# scripts as user gitea, then restart the cron daemon.
#
# Schedule written to the file: health check every minute, performance and
# error monitors every 5 minutes, hourly removal of monitoring *.log files
# whose modification time is older than 7 days.
setup_cron_jobs() {
  print_info "配置定时任务..."

  cat > /etc/cron.d/photography-monitoring << 'EOF'
# Photography Portfolio 监控定时任务
# 每分钟执行健康检查
* * * * * gitea /opt/photography/monitoring/scripts/health-check.sh
# 每5分钟执行性能监控
*/5 * * * * gitea /opt/photography/monitoring/scripts/performance-monitor.sh
# 每5分钟执行错误监控
*/5 * * * * gitea /opt/photography/monitoring/scripts/error-monitor.sh
# 每小时执行一次清理删除超过7天的监控日志
0 * * * * gitea find /var/log/photography/monitoring/ -name "*.log" -mtime +7 -delete
EOF

  # Pin the mode instead of inheriting it from the caller's umask; cron
  # implementations may ignore cron.d files writable by group or others.
  chmod 644 /etc/cron.d/photography-monitoring

  # The unit is called "cron" on apt-based systems and "crond" on yum-based
  # ones; both package managers are supported by this installer, so try both.
  # The error output of the first attempt is hidden because its failure is
  # expected on yum-based hosts.
  systemctl restart cron 2> /dev/null || systemctl restart crond
}
# Dashboard
# Generate /opt/photography/monitoring/scripts/dashboard.sh, make it
# executable and hand it to gitea:gitea.
#
# The generated script prints a one-shot text report: time/uptime/load,
# systemd service state, listening ports, disk and memory usage, and the last
# five lines of the error and health logs.
create_dashboard() {
  print_info "创建监控仪表板..."

  # The memory percentage in the generated script is computed from plain
  # `free` output. The -h columns carry unit suffixes (for example 512Mi and
  # 7.7Gi) and awk would divide only their leading numbers, giving a
  # meaningless ratio; -h is kept for the displayed used/total values only.
  cat > /opt/photography/monitoring/scripts/dashboard.sh << 'EOF'
#!/bin/bash
# 摄影作品集监控仪表板
echo "=========================="
echo "摄影作品集系统监控仪表板"
echo "=========================="
echo ""

# 系统信息
echo "📊 系统信息:"
echo " 时间: $(date)"
echo " 运行时间: $(uptime -p)"
echo " 负载: $(uptime | awk -F'load average:' '{print $2}')"
echo ""

# 服务状态
echo "🔧 服务状态:"
services=("caddy" "photography-backend" "postgresql" "redis")
for service in "${services[@]}"; do
  if systemctl is-active --quiet "$service"; then
    echo " $service: ✅ 运行中"
  else
    echo " $service: ❌ 停止"
  fi
done
echo ""

# 端口监听
echo "🌐 端口监听:"
ports=("80" "443" "8080" "5432" "6379")
for port in "${ports[@]}"; do
  if netstat -tlnp | grep -q ":$port "; then
    echo " 端口 $port: ✅ 监听中"
  else
    echo " 端口 $port: ❌ 未监听"
  fi
done
echo ""

# 磁盘使用
echo "💾 磁盘使用:"
df -h | grep -E '^/dev/' | awk '{print " " $1 ": " $5 " 已使用 (" $3 "/" $2 ")"}'
echo ""

# 内存使用
echo "🧠 内存使用:"
mem_percent=$(free | awk '/^Mem:/ {print int($3/$2*100)}')
free -h | awk -v pct="$mem_percent" '/^Mem:/ {print " 内存: " $3 "/" $2 " (" pct "% 已使用)"}'
echo ""

# 最近的错误
echo "⚠️ 最近的错误 (最近5条):"
if [ -f "/var/log/photography/monitoring/errors.log" ]; then
  tail -5 /var/log/photography/monitoring/errors.log | sed 's/^/ /'
else
  echo " 没有错误日志"
fi
echo ""

# 最近的健康检查
echo "💚 最近的健康检查 (最近5条):"
if [ -f "/var/log/photography/monitoring/health.log" ]; then
  tail -5 /var/log/photography/monitoring/health.log | sed 's/^/ /'
else
  echo " 没有健康检查日志"
fi
echo ""
echo "=========================="
echo "监控仪表板完成"
echo "=========================="
EOF

  # Make the script runnable by the monitoring user.
  chmod +x /opt/photography/monitoring/scripts/dashboard.sh
  chown gitea:gitea /opt/photography/monitoring/scripts/dashboard.sh
}
# Monitoring configuration file
# Write /etc/photography/monitoring/config.yaml, hand it to gitea:gitea and
# set its mode to 644.
#
# The YAML is emitted with explicit nesting. Written flat, every key lands at
# the top level, where names such as check_interval, enabled, timeout and
# name/type repeat and the sections lose their members.
# NOTE(review): the nesting below (all sections under "monitoring") is
# reconstructed from the section comments; nothing in this installer parses
# the file, so confirm against whatever consumes it.
# NOTE(review): the file is world-readable (644) and has an SMTP password
# field; tighten the mode before storing a real credential here.
create_monitoring_config() {
  print_info "创建监控配置文件..."

  cat > /etc/photography/monitoring/config.yaml << 'EOF'
# Photography Portfolio 监控配置
monitoring:
  # 日志配置
  logging:
    level: "info"
    max_size: "10MB"
    max_age: 30
    max_backups: 5
    compress: true
  # 性能监控
  performance:
    check_interval: 300 # 5分钟
    cpu_threshold: 80
    memory_threshold: 80
    disk_threshold: 80
    response_time_threshold: 2.0
  # 错误监控
  error_monitoring:
    check_interval: 300 # 5分钟
    max_errors_per_check: 10
    alert_threshold: 5
  # 健康检查
  health_check:
    check_interval: 60 # 1分钟
    timeout: 10
    retry_count: 3
  # 服务列表
  services:
    - name: "caddy"
      type: "systemd"
      critical: true
    - name: "photography-backend"
      type: "systemd"
      critical: true
    - name: "postgresql"
      type: "systemd"
      critical: true
    - name: "redis"
      type: "systemd"
      critical: false
  # 端点检查
  endpoints:
    - name: "Frontend"
      url: "https://photography.iriver.top"
      method: "GET"
      expected_status: 200
      timeout: 10
    - name: "Admin"
      url: "https://admin.photography.iriver.top"
      method: "GET"
      expected_status: 200
      timeout: 10
    - name: "API Health"
      url: "https://api.photography.iriver.top/health"
      method: "GET"
      expected_status: 200
      timeout: 5
    - name: "Backend Health"
      url: "http://localhost:8080/health"
      method: "GET"
      expected_status: 200
      timeout: 5
  # 警报配置
  alerts:
    enabled: true
    channels:
      - type: "log"
        enabled: true
        level: "error"
      - type: "email"
        enabled: false
        smtp_server: ""
        smtp_port: 587
        username: ""
        password: ""
        to: ""
        from: ""
EOF

  # Ownership and mode of the generated file.
  chown gitea:gitea /etc/photography/monitoring/config.yaml
  chmod 644 /etc/photography/monitoring/config.yaml
}
# Main routine
# Print the banner, run every setup step in order, then print a summary of
# the installed scripts, log locations, usage hints and notes.
main() {
  printf '%s\n' \
    "===================================" \
    "摄影作品集监控系统配置脚本" \
    "===================================" \
    ""

  # Setup steps, executed strictly in this order.
  local -a steps=(
    check_permissions
    check_dependencies
    create_directories
    setup_logging
    setup_performance_monitoring
    setup_error_reporting
    setup_health_checks
    setup_cron_jobs
    create_dashboard
    create_monitoring_config
  )
  local step
  for step in "${steps[@]}"; do
    "$step"
  done

  print_info "监控系统配置完成!"

  # Static summary text; the quoted delimiter keeps it fully literal.
  cat << 'EOF'

🔧 监控脚本位置:
 - 性能监控: /opt/photography/monitoring/scripts/performance-monitor.sh
 - 错误监控: /opt/photography/monitoring/scripts/error-monitor.sh
 - 健康检查: /opt/photography/monitoring/scripts/health-check.sh
 - 监控仪表板: /opt/photography/monitoring/scripts/dashboard.sh

📊 监控日志位置:
 - 性能日志: /var/log/photography/monitoring/performance.log
 - 错误日志: /var/log/photography/monitoring/errors.log
 - 健康日志: /var/log/photography/monitoring/health.log

📋 使用方法:
 - 查看监控仪表板: /opt/photography/monitoring/scripts/dashboard.sh
 - 手动执行健康检查: /opt/photography/monitoring/scripts/health-check.sh
 - 查看监控配置: /etc/photography/monitoring/config.yaml

⚠️ 注意事项:
 - 定时任务已配置,每分钟执行健康检查
 - 每5分钟执行性能和错误监控
 - 日志文件会自动轮转保留30天
 - 可以根据需要修改配置文件

EOF

  print_info "监控系统安装完成!"
}
# Entry point: run main, forwarding the script's command-line arguments.
main "$@"