#!/bin/bash
# Launch a vLLM OpenAI-compatible server for Llama 4 Maverick in Docker,
# then smoke-test the chat completions endpoint.
#
# Requires: docker with the NVIDIA runtime, nvidia-smi, and a Hugging Face
# token with access to the gated meta-llama repo (export HF_TOKEN or edit below).
set -euo pipefail

# Make apt and other tooling non-interactive.
export DEBIAN_FRONTEND=noninteractive

echo "---------------------------------------------------"
echo "Starting vLLM docker container"
echo "---------------------------------------------------"

# Model to serve and the Hugging Face token used to download it.
# Prefer a token from the environment; fall back to the inline placeholder.
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
HF_TOKEN="${HF_TOKEN:-[insert-hf-token]}"

# Count available GPUs. Default to 1 when nvidia-smi is missing so
# --tensor-parallel-size never receives 0.
if command -v nvidia-smi >/dev/null 2>&1; then
  NUM_GPUS=$(nvidia-smi -L | wc -l)
else
  NUM_GPUS=1
fi

# Launch the vLLM OpenAI-compatible server in a detached container.
# Build the command as an argv array instead of a flat string replayed
# through `bash -c`: the array keeps "$MODEL_NAME", "$HF_TOKEN" and
# "$NUM_GPUS" as single, safely-quoted words (no double expansion, no
# injection surface if a value ever contains spaces or metacharacters).
# max-model-len from: https://blog.vllm.ai/2025/04/05/llama4.html
DOCKER_ARGS=(
  docker run -d --gpus all
  -v /ephemeral/.cache/huggingface:/root/.cache/huggingface
  -p 8000:8000
  --ipc=host
  --restart always
  --env "HF_TOKEN=$HF_TOKEN"
  vllm/vllm-openai:latest
  --tensor-parallel-size "$NUM_GPUS"
  --model "$MODEL_NAME"
  --max-model-len 430000
)

# Run the Docker command as the ubuntu user.
# NOTE(review): this echo prints the HF token to the log, as the original
# did — consider redacting it if logs are shared.
echo "Executing Docker command: ${DOCKER_ARGS[*]}"
sudo -u ubuntu "${DOCKER_ARGS[@]}"

# Smoke-test the API. Model download + startup can take ~10 minutes, so
# poll the server until it answers (up to ~20 minutes) instead of firing
# the request immediately and failing.
echo "Waiting for the vLLM server to become ready..."
for ((attempt = 0; attempt < 120; attempt++)); do
  if curl -sf http://localhost:8000/v1/models >/dev/null; then
    break
  fi
  sleep 10
done

# Send a single chat completion request. The interpolated model name is
# double-quoted ("'"$MODEL_NAME"'") so the payload stays a single valid
# JSON argument even if the value ever contains spaces.
curl -X POST http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"$MODEL_NAME"'",
        "messages": [
            {
                "role": "user",
                "content": "Hello, how are you?"
            }
        ]
    }'

