Files
DronePlanning/start_all.sh
huangfu a6c2027caa feat: 添加一键启动脚本并更新项目配置
- 添加 start_all.sh 一键启动脚本,支持启动llama-server和FastAPI服务
- 修改启动脚本使用venv虚拟环境替代conda环境
- 更新README.md,添加一键启动脚本使用说明
- 更新py_tree_generator.py,添加final_prompt返回字段
- 禁用Qwen3模型的思考功能
- 添加RAG检索结果的终端打印
- 移除ROS2相关代码(ros2_client.py已删除)
2025-12-02 21:42:26 +08:00

407 lines
14 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# ==============================================================================
# Drone natural-language control project - one-shot startup script
# ==============================================================================
# Starts every required service: llama-server (inference model and embedding
# model) plus the FastAPI backend.
# Usage: ./start_all.sh [start|stop|restart|status]  (see dispatch at bottom)
# ==============================================================================
set -e # abort immediately on any unhandled error
# ANSI color codes used by the print_* helpers below
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default configuration (each value may be overridden via environment variables).
# NOTE: a leading '~' in these defaults is quoted and therefore NOT expanded
# here; start_services expands it later via 'eval echo'.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LLAMA_SERVER_DIR="${LLAMA_SERVER_DIR:-~/llama.cpp/build/bin}"
INFERENCE_MODEL="${INFERENCE_MODEL:-~/models/gguf/Qwen/Qwen3-4B/Qwen3-4B-Q5_K_M.gguf}"
EMBEDDING_MODEL="${EMBEDDING_MODEL:-~/models/gguf/Qwen/Qwen3-Embedding-4B/Qwen3-Embedding-4B-Q4_K_M.gguf}"
VENV_PATH="${VENV_PATH:-${PROJECT_ROOT}/backend_service/venv}"
LOG_DIR="${PROJECT_ROOT}/logs"
# PID_FILE holds one PID per line; written by start_services, read by stop/status
PID_FILE="${LOG_DIR}/services.pid"
# Fixed port assignments for the three services
INFERENCE_PORT=8081
EMBEDDING_PORT=8090
API_PORT=8000
# Ensure the log directory exists before any service writes to it
mkdir -p "${LOG_DIR}"
# ==============================================================================
# 辅助函数
# ==============================================================================
# Print a blue [INFO]-tagged line; backslash escapes in $1 are honored
# (matches the previous 'echo -e' semantics via printf's %b).
print_info() {
  printf '%b[INFO]%b %b\n' "${BLUE}" "${NC}" "$1"
}
# Print a green [SUCCESS]-tagged line; escapes in $1 are interpreted (%b).
print_success() {
  printf '%b[SUCCESS]%b %b\n' "${GREEN}" "${NC}" "$1"
}
# Print a yellow [WARNING]-tagged line; escapes in $1 are interpreted (%b).
print_warning() {
  printf '%b[WARNING]%b %b\n' "${YELLOW}" "${NC}" "$1"
}
# Print a red [ERROR]-tagged line; escapes in $1 are interpreted (%b).
print_error() {
  printf '%b[ERROR]%b %b\n' "${RED}" "${NC}" "$1"
}
# 检查命令是否存在
# Return 0 if command $1 is resolvable on PATH; otherwise print an error
# and return 1.
check_command() {
  command -v "$1" > /dev/null 2>&1 && return 0
  print_error "$1 命令未找到,请先安装"
  return 1
}
# 检查端口是否被占用
# Return 0 when something is LISTENing on TCP port $1, 1 otherwise.
# Relies on lsof; if lsof fails or is missing, the port is reported as free.
check_port() {
  local probe_port="$1"
  lsof -Pi :"${probe_port}" -sTCP:LISTEN -t > /dev/null 2>&1 && return 0
  return 1
}
# 等待服务就绪
# Poll ${url} with curl (one attempt per second) until it responds or the
# retry budget is exhausted.
# Arguments:
#   $1 - URL to probe
#   $2 - human-readable service name (used in log messages)
#   $3 - (optional) maximum attempts; defaults to 30, preserving the previous
#        hard-coded behavior for existing callers
# Returns: 0 once the URL answers, 1 on timeout.
wait_for_service() {
  local url=$1
  local service_name=$2
  local max_attempts=${3:-30}
  local attempt=0
  print_info "等待 ${service_name} 启动..."
  while [ "$attempt" -lt "$max_attempts" ]; do
    if curl -s "${url}" > /dev/null 2>&1; then
      print_success "${service_name} 已就绪"
      return 0
    fi
    attempt=$((attempt + 1))
    sleep 1
  done
  print_error "${service_name} 启动超时"
  return 1
}
# 停止所有服务
# Stop every recorded service process, then sweep the known service ports for
# stragglers. Safe to call repeatedly; a missing PID file or dead PIDs are
# silently skipped.
stop_services() {
  print_info "正在停止所有服务..."
  # First pass: kill the PIDs recorded by start_services.
  if [ -f "${PID_FILE}" ]; then
    # read -r: take each recorded PID line literally
    while read -r pid; do
      if ps -p "$pid" > /dev/null 2>&1; then
        print_info "停止进程 PID: $pid"
        kill "$pid" 2>/dev/null || true
      fi
    done < "${PID_FILE}"
    rm -f "${PID_FILE}"
  fi
  # Second pass: anything still listening on our ports. 'lsof -ti' may print
  # several PIDs (one per line), so iterate and kill each one individually
  # instead of passing the raw multi-line string around.
  for port in ${INFERENCE_PORT} ${EMBEDDING_PORT} ${API_PORT}; do
    if check_port "${port}"; then
      for pid in $(lsof -ti:"${port}"); do
        print_info "停止占用端口 ${port} 的进程 (PID: $pid)"
        kill "$pid" 2>/dev/null || true
      done
    fi
  done
  print_success "所有服务已停止"
}
# 清理函数(脚本退出时调用)
# EXIT-trap handler: report an abnormal exit status. Services are deliberately
# left running — stopping them is the user's call ('./start_all.sh stop').
cleanup() {
  local status=$?
  if [ "$status" -ne 0 ]; then
    print_error "启动过程中发生错误,正在清理..."
  fi
}
trap cleanup EXIT
# ==============================================================================
# 主函数
# ==============================================================================
# Check all prerequisites, then launch the inference llama-server, the
# embedding llama-server, and the FastAPI backend; record each PID in
# ${PID_FILE} and block (via 'wait') until Ctrl+C, which runs stop_services.
start_services() {
print_info "=========================================="
print_info " 无人机自然语言控制项目 - 服务启动"
print_info "=========================================="
echo ""
# Verify required external commands are available before doing anything else
print_info "检查必要的命令..."
check_command "python3" || exit 1
check_command "curl" || exit 1
check_command "lsof" || print_warning "lsof 未安装,将无法检查端口占用"
echo ""
# Port collisions: busy model ports are only warnings (those servers may
# already be running), but a busy API port aborts the launch.
print_info "检查端口占用..."
if check_port ${INFERENCE_PORT}; then
print_warning "端口 ${INFERENCE_PORT} 已被占用,推理模型可能已在运行"
fi
if check_port ${EMBEDDING_PORT}; then
print_warning "端口 ${EMBEDDING_PORT} 已被占用Embedding模型可能已在运行"
fi
if check_port ${API_PORT}; then
print_error "端口 ${API_PORT} 已被占用,请先停止占用该端口的服务"
exit 1
fi
echo ""
# Locate llama-server ('eval echo' expands a leading '~' in the configured path)
local llama_server_dir_expanded=$(eval echo "${LLAMA_SERVER_DIR}")
local llama_server="${llama_server_dir_expanded}/llama-server"
if [ ! -f "${llama_server}" ]; then
print_error "llama-server 未找到: ${llama_server}"
print_info "请设置 LLAMA_SERVER_DIR 环境变量指向正确的路径"
print_info "当前路径: ${LLAMA_SERVER_DIR}"
print_info "展开后路径: ${llama_server_dir_expanded}"
exit 1
fi
print_success "找到 llama-server: ${llama_server}"
echo ""
# Verify both GGUF model files exist ('~' expanded the same way)
local inference_model_expanded=$(eval echo "${INFERENCE_MODEL}")
local embedding_model_expanded=$(eval echo "${EMBEDDING_MODEL}")
if [ ! -f "${inference_model_expanded}" ]; then
print_error "推理模型文件未找到: ${inference_model_expanded}"
print_info "请设置 INFERENCE_MODEL 环境变量指向正确的模型路径"
exit 1
fi
print_success "找到推理模型: ${inference_model_expanded}"
if [ ! -f "${embedding_model_expanded}" ]; then
print_error "Embedding模型文件未找到: ${embedding_model_expanded}"
print_info "请设置 EMBEDDING_MODEL 环境变量指向正确的模型路径"
exit 1
fi
print_success "找到Embedding模型: ${embedding_model_expanded}"
echo ""
# ROS2 environment is optional (the project may have been decoupled from ROS2)
local ros2_setup="${PROJECT_ROOT}/install/setup.bash"
if [ ! -f "${ros2_setup}" ]; then
print_warning "ROS2 setup文件未找到: ${ros2_setup}"
print_warning "如果项目已与ROS2解耦可以忽略此警告"
else
print_success "找到ROS2 setup文件: ${ros2_setup}"
fi
echo ""
# Verify the Python venv used by the FastAPI backend is present and valid
local venv_path_expanded=$(eval echo "${VENV_PATH}")
print_info "检查venv虚拟环境: ${venv_path_expanded}"
if [ ! -d "${venv_path_expanded}" ]; then
print_error "venv虚拟环境目录不存在: ${venv_path_expanded}"
print_info "请先创建venv环境: python3 -m venv ${venv_path_expanded}"
print_info "然后安装依赖: ${venv_path_expanded}/bin/pip install -r backend_service/requirements.txt"
exit 1
fi
if [ ! -f "${venv_path_expanded}/bin/activate" ]; then
print_error "venv激活脚本不存在: ${venv_path_expanded}/bin/activate"
print_error "这看起来不是一个有效的venv环境"
exit 1
fi
print_success "venv虚拟环境存在: ${venv_path_expanded}"
echo ""
# Truncate the PID file; each launched service appends its PID below
> "${PID_FILE}"
# ==========================================================================
# Start the inference model server
# ==========================================================================
print_info "启动推理模型服务 (端口 ${INFERENCE_PORT})..."
cd "${llama_server_dir_expanded}"
nohup ./llama-server \
-m "${inference_model_expanded}" \
--port ${INFERENCE_PORT} \
--gpu-layers 36 \
--host 0.0.0.0 \
-c 8192 \
> "${LOG_DIR}/inference_model.log" 2>&1 &
local inference_pid=$!
echo $inference_pid >> "${PID_FILE}"
print_success "推理模型服务已启动 (PID: $inference_pid)"
print_info "日志文件: ${LOG_DIR}/inference_model.log"
echo ""
# ==========================================================================
# Start the embedding model server
# ==========================================================================
print_info "启动Embedding模型服务 (端口 ${EMBEDDING_PORT})..."
nohup ./llama-server \
-m "${embedding_model_expanded}" \
--gpu-layers 36 \
--port ${EMBEDDING_PORT} \
--embeddings \
--pooling last \
--host 0.0.0.0 \
> "${LOG_DIR}/embedding_model.log" 2>&1 &
local embedding_pid=$!
echo $embedding_pid >> "${PID_FILE}"
print_success "Embedding模型服务已启动 (PID: $embedding_pid)"
print_info "日志文件: ${LOG_DIR}/embedding_model.log"
echo ""
# ==========================================================================
# Wait for the model servers to become ready
# ==========================================================================
print_info "等待模型服务就绪..."
sleep 3 # give the servers a moment to start
# Wait for the inference server; fall back to /v1/models if /health is absent
if ! wait_for_service "http://localhost:${INFERENCE_PORT}/health" "推理模型服务"; then
if ! wait_for_service "http://localhost:${INFERENCE_PORT}/v1/models" "推理模型服务"; then
print_warning "推理模型服务可能未完全就绪,但将继续启动"
fi
fi
# Wait for the embedding server, with the same /v1/models fallback
if ! wait_for_service "http://localhost:${EMBEDDING_PORT}/health" "Embedding模型服务"; then
if ! wait_for_service "http://localhost:${EMBEDDING_PORT}/v1/models" "Embedding模型服务"; then
print_warning "Embedding模型服务可能未完全就绪但将继续启动"
fi
fi
echo ""
# ==========================================================================
# Start the FastAPI backend
# ==========================================================================
print_info "启动FastAPI后端服务 (端口 ${API_PORT})..."
cd "${PROJECT_ROOT}"
# Activate the venv and run uvicorn inside a fresh shell ('bash -c') so the
# activation does not leak into this script's environment. The '#' lines
# inside the quoted string below are part of the command passed to 'bash -c'.
bash -c "
# 激活ROS2环境如果存在
if [ -f '${ros2_setup}' ]; then
source '${ros2_setup}'
fi
# 激活venv虚拟环境
source '${venv_path_expanded}/bin/activate' && \
cd '${PROJECT_ROOT}/backend_service' && \
uvicorn src.main:app --host 0.0.0.0 --port ${API_PORT}
" > "${LOG_DIR}/fastapi.log" 2>&1 &
local api_pid=$!
echo $api_pid >> "${PID_FILE}"
print_success "FastAPI服务已启动 (PID: $api_pid)"
print_info "日志文件: ${LOG_DIR}/fastapi.log"
echo ""
# Wait for the FastAPI service to come up (probe its /docs page)
sleep 3
if wait_for_service "http://localhost:${API_PORT}/docs" "FastAPI服务"; then
print_success "所有服务已成功启动!"
else
print_warning "FastAPI服务可能未完全就绪请检查日志: ${LOG_DIR}/fastapi.log"
fi
echo ""
# Show access information for all services
print_info "=========================================="
print_info " 服务启动完成!"
print_info "=========================================="
print_info "推理模型API: http://localhost:${INFERENCE_PORT}/v1"
print_info "Embedding模型API: http://localhost:${EMBEDDING_PORT}/v1"
print_info "FastAPI后端: http://localhost:${API_PORT}"
print_info "API文档: http://localhost:${API_PORT}/docs"
print_info ""
print_info "日志文件位置:"
print_info " - 推理模型: ${LOG_DIR}/inference_model.log"
print_info " - Embedding模型: ${LOG_DIR}/embedding_model.log"
print_info " - FastAPI服务: ${LOG_DIR}/fastapi.log"
print_info ""
print_info "按 Ctrl+C 停止所有服务"
print_info "=========================================="
echo ""
# Install signal handlers so Ctrl+C (INT) / TERM stop everything cleanly
trap 'print_info "\n正在停止服务..."; stop_services; exit 0' INT TERM
# Keep the script in the foreground so the traps above stay in effect
print_info "所有服务正在运行中,查看日志请使用:"
print_info " tail -f ${LOG_DIR}/*.log"
echo ""
# Block on all background children until they exit or a signal arrives
wait
}
# ==============================================================================
# 脚本入口
# ==============================================================================
# Script entry point: dispatch on the first argument (defaults to 'start').
case "${1:-start}" in
  start)
    start_services
    ;;
  stop)
    stop_services
    ;;
  restart)
    stop_services
    sleep 2
    start_services
    ;;
  status)
    print_info "检查服务状态..."
    if [ -f "${PID_FILE}" ]; then
      print_info "已记录的服务进程:"
      # read -r: take each recorded PID line literally
      while read -r pid; do
        if ps -p "$pid" > /dev/null 2>&1; then
          print_success "PID $pid: 运行中"
        else
          print_warning "PID $pid: 已停止"
        fi
      done < "${PID_FILE}"
    else
      print_info "未找到PID文件服务可能未启动"
    fi
    echo ""
    print_info "端口占用情况:"
    for port in ${INFERENCE_PORT} ${EMBEDDING_PORT} ${API_PORT}; do
      if check_port ${port}; then
        # FIX: was 'local pid=...', which is illegal outside a function and,
        # combined with 'set -e', aborted './start_all.sh status' whenever a
        # port was occupied. Use a plain variable at top level.
        pid=$(lsof -ti:${port})
        print_success "端口 ${port}: 被占用 (PID: $pid)"
      else
        print_warning "端口 ${port}: 空闲"
      fi
    done
    ;;
  *)
    echo "用法: $0 {start|stop|restart|status}"
    echo ""
    echo "命令说明:"
    echo " start - 启动所有服务(默认)"
    echo " stop - 停止所有服务"
    echo " restart - 重启所有服务"
    echo " status - 查看服务状态"
    echo ""
    echo "环境变量配置:"
    echo " LLAMA_SERVER_DIR - llama-server所在目录 (默认: ~/llama.cpp/build/bin)"
    echo " INFERENCE_MODEL - 推理模型路径 (默认: ~/models/gguf/Qwen/Qwen3-4B/Qwen3-4B-Q5_K_M.gguf)"
    echo " EMBEDDING_MODEL - Embedding模型路径 (默认: ~/models/gguf/Qwen/Qwen3-Embedding-4B/Qwen3-Embedding-4B-Q4_K_M.gguf)"
    echo " VENV_PATH - venv虚拟环境路径 (默认: \${PROJECT_ROOT}/backend_service/venv)"
    exit 1
    ;;
esac