#!/bin/bash

# Export DEBIAN_FRONTEND=noninteractive so that any child process that
# consults it (e.g. debconf-based package tooling) does not stop to prompt.
export DEBIAN_FRONTEND=noninteractive

# Banner marking the start of the vLLM container launch in the log output.
echo "---------------------------------------------------"
echo "Starting vLLM docker container"
echo "---------------------------------------------------"

# Build and run the `docker run` command that starts the vLLM OpenAI-compatible
# server as a detached container publishing port 8000.

# Model to serve and its maximum context length. Both keep their previous
# defaults but can be overridden from the environment, e.g.
#   MAX_MODEL_LEN=16384 ./this_script.sh
MODEL_NAME="${MODEL_NAME:-Qwen/Qwen2.5-VL-32B-Instruct}"
# Use a smaller value (e.g. 16384) for a GPU with ~40GB VRAM.
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"

# Determine the number of GPUs. Fail early instead of starting a container
# with an invalid tensor-parallel size: because of `--restart always`, such a
# container would be restarted by Docker indefinitely.
if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "Error: nvidia-smi not found; cannot determine the number of GPUs" >&2
    exit 1
fi
if ! GPU_LIST=$(nvidia-smi -L); then
    echo "Error: 'nvidia-smi -L' failed; cannot determine the number of GPUs" >&2
    exit 1
fi
# Count only the "GPU <index>: ..." lines so that any other output lines
# (e.g. MIG device entries or diagnostics) are not counted as GPUs.
# `|| true`: grep exits non-zero when the count is 0; the count is still printed.
NUM_GPUS=$(grep -c '^GPU ' <<<"$GPU_LIST" || true)
if ((NUM_GPUS < 1)); then
    echo "Error: no NVIDIA GPUs detected" >&2
    exit 1
fi
# NOTE(review): vLLM presumably requires the model's attention-head count to be
# divisible by the tensor-parallel size -- confirm for unusual GPU counts.

# The command is kept as an array so every argument reaches docker exactly as
# written, without a second round of shell parsing.
DOCKER_CMD=(
    docker run -d --gpus all
    -v /ephemeral/.cache/huggingface:/root/.cache/huggingface
    -v /home/ubuntu/vllm:/vllm_repo
    -p 8000:8000
    --ipc=host
    --restart always
    vllm/vllm-openai:latest
    --tensor-parallel-size "$NUM_GPUS"
    --model "$MODEL_NAME"
    --max-model-len "$MAX_MODEL_LEN"
)

# Run the Docker command as ubuntu user
printf 'Executing Docker command:'
printf ' %q' "${DOCKER_CMD[@]}"
printf '\n'
sudo -u ubuntu "${DOCKER_CMD[@]}"

# Send a test request.
# Wait ~10 minutes for the model download and server start-up before running this.
# IMAGE_URL="https://www.hyperstack.cloud/hs-fs/hubfs/deploy-vm-11-ecd8c53003182041d3a2881d0010f6c6-1.png?width=3352&height=1852&name=deploy-vm-11-ecd8c53003182041d3a2881d0010f6c6-1.png"
# MODEL_NAME="Qwen/Qwen2.5-VL-32B-Instruct"
# curl -X POST http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#         "model": "'"$MODEL_NAME"'",
#         "messages": [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": "Describe this image in two sentences"
#                     },
#                     {
#                         "type": "image_url",
#                         "image_url": {
#                             "url": "'"$IMAGE_URL"'"
#                         }
#                     }
#                 ]
#             }
#         ]
#     }'