Skip to content

Lab 10: vLLM Deployment

Difficulty: Advanced — Due: 2026-05-12
  • Install vLLM and configure the environment on the DGX H100 server
  • Deploy DeepSeek-Coder-V2-Lite (16B) and serve via OpenAI-compatible API
  • Run performance benchmarks measuring Throughput, TTFT, and TBT metrics
  • DGX H100 SSH access (see Lab 01)
  • NVIDIA driver and CUDA 12.1+ confirmed via: nvidia-smi
  • Hugging Face account and access token (HF_TOKEN)
  1. Create a Virtual Environment

    Terminal window
    # On the DGX server
    python3 -m venv ~/vllm-env
    source ~/vllm-env/bin/activate
  2. Install vLLM

    Terminal window
    pip install vllm==0.4.3
    # Check CUDA version before installing — H100 uses CUDA 12.x
    pip install torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121
  3. Log in to Hugging Face

    Terminal window
    pip install huggingface_hub
    huggingface-cli login --token $HF_TOKEN
  4. Download the Model

    Terminal window
    # DeepSeek-Coder-V2-Lite-Instruct (16B, ~32GB)
    huggingface-cli download \
    deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \
    --local-dir ~/models/deepseek-coder-v2-lite \
    --local-dir-use-symlinks False
start_server.sh
#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server for DeepSeek-Coder-V2-Lite.
# All output (stdout + stderr) is duplicated into vllm_server.log via tee.
set -euo pipefail  # fail fast on errors, unset variables, and pipeline failures

MODEL_PATH="$HOME/models/deepseek-coder-v2-lite"
PORT=8000
GPU_UTIL=0.90       # Fraction of GPU memory vLLM may allocate (weights + KV cache)
MAX_MODEL_LEN=32768 # Maximum context length (prompt + completion tokens)

# --trust-remote-code is required because the DeepSeek model repo ships
# custom modeling code that must be executed to load the weights.
python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_PATH" \
  --dtype bfloat16 \
  --tensor-parallel-size 2 \
  --gpu-memory-utilization "$GPU_UTIL" \
  --max-model-len "$MAX_MODEL_LEN" \
  --port "$PORT" \
  --host 0.0.0.0 \
  --served-model-name deepseek-coder-v2 \
  --trust-remote-code \
  2>&1 | tee vllm_server.log
Terminal window
# Run in background
# (nohup keeps the server alive after the terminal session ends)
nohup bash start_server.sh &
# Wait for server to be ready
# NOTE(review): loading a 16B model can take several minutes on first start;
# this loop has no timeout, so check vllm_server.log if it never completes.
until curl -s http://localhost:8000/health > /dev/null; do
echo "Waiting for server to start..."
sleep 5
done
echo "Server ready"
test_api.py
"""Smoke-test the local vLLM OpenAI-compatible endpoint with one code-gen prompt."""
from openai import OpenAI

# vLLM exposes an OpenAI-compatible API; the SDK demands a key string,
# but the server never checks it.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-required",  # vLLM requires no authentication by default
)

# Basic code generation test
chat_messages = [
    {
        "role": "user",
        "content": "Implement quicksort in Python with type hints.",
    }
]
response = client.chat.completions.create(
    model="deepseek-coder-v2",
    messages=chat_messages,
    max_tokens=512,
    temperature=0.1,
)

print(response.choices[0].message.content)
print(f"\nTokens: {response.usage}")
benchmark.py
import time
import asyncio
import statistics
from dataclasses import dataclass
from openai import AsyncOpenAI
@dataclass
class RequestMetrics:
    """Latency and throughput measurements for a single streamed completion."""

    prompt_tokens: int        # prompt-side token count (not populated by this benchmark)
    completion_tokens: int    # generated tokens reported by the server
    ttft_ms: float            # Time To First Token
    total_ms: float           # Total response time
    throughput_tps: float     # Tokens Per Second
@dataclass
class BenchmarkResult:
    """Aggregated metrics for one benchmark run at a fixed concurrency level."""

    total_requests: int          # number of prompts attempted
    concurrency: int             # max simultaneous in-flight requests
    avg_ttft_ms: float           # mean time-to-first-token
    p50_ttft_ms: float           # median time-to-first-token
    p99_ttft_ms: float           # 99th-percentile time-to-first-token
    avg_throughput_tps: float    # mean per-request tokens/sec
    total_throughput_tps: float  # total tokens / wall-clock time of the whole run
    success_rate: float          # fraction of requests that returned a result

    def print_report(self):
        """Print a human-readable summary of this run to stdout."""
        # The report body is deliberately left-aligned (column 0) inside the
        # f-string so the banner lines up regardless of code indentation.
        print(f"""
========== vLLM Benchmark Results ==========
Total requests: {self.total_requests}
Concurrent requests: {self.concurrency}
Success rate: {self.success_rate:.1%}
--- Latency ---
TTFT P50: {self.p50_ttft_ms:.1f}ms
TTFT P99: {self.p99_ttft_ms:.1f}ms
TTFT Average: {self.avg_ttft_ms:.1f}ms
--- Throughput ---
Single request TPS: {self.avg_throughput_tps:.1f} tokens/sec
Total TPS: {self.total_throughput_tps:.1f} tokens/sec
=============================================
""")
async def single_request(
    client: AsyncOpenAI,
    prompt: str,
    max_tokens: int = 256
) -> RequestMetrics | None:
    """Send one streaming chat completion and measure TTFT, latency, and TPS.

    Returns None on any failure so the caller can compute a success rate.
    """
    start = time.perf_counter()
    first_token_time = None
    total_tokens = 0
    content_chunks = 0  # fallback token estimate if the server omits usage
    try:
        stream = await client.chat.completions.create(
            model="deepseek-coder-v2",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            stream=True,
            # BUG FIX: OpenAI-compatible servers (including vLLM) do not attach
            # `usage` to streaming responses unless explicitly requested, so
            # total_tokens previously stayed 0 and throughput always read 0.
            stream_options={"include_usage": True},
        )
        async for chunk in stream:
            if chunk.choices:
                delta = chunk.choices[0].delta
                # Only count chunks that actually carry content — the first
                # chunk is often a role-only delta and must not set TTFT.
                if delta is not None and delta.content:
                    content_chunks += 1
                    if first_token_time is None:
                        first_token_time = time.perf_counter()
            if chunk.usage:
                total_tokens = chunk.usage.completion_tokens
        if total_tokens == 0:
            # Server reported no usage; approximate one token per content chunk.
            total_tokens = content_chunks
        end = time.perf_counter()
        ttft = (first_token_time - start) * 1000 if first_token_time else 0
        total_ms = (end - start) * 1000
        tps = total_tokens / (end - start) if (end - start) > 0 else 0
        return RequestMetrics(
            prompt_tokens=0,  # not measured; streaming usage covers completion only
            completion_tokens=total_tokens,
            ttft_ms=ttft,
            total_ms=total_ms,
            throughput_tps=tps
        )
    except Exception as e:
        # Best-effort benchmark: report the failure and let the caller skip it.
        print(f"Request failed: {e}")
        return None
async def run_benchmark(
    prompts: list[str],
    concurrency: int = 4
) -> BenchmarkResult:
    """Fan out *prompts* with at most *concurrency* in flight; aggregate metrics.

    Raises:
        ValueError: if *prompts* is empty (success rate would be 0/0).
    """
    if not prompts:
        # BUG FIX: previously an empty list crashed with ZeroDivisionError
        # at the success_rate computation.
        raise ValueError("prompts must not be empty")
    client = AsyncOpenAI(
        base_url="http://localhost:8000/v1",
        api_key="not-required"  # vLLM does not check the key by default
    )
    semaphore = asyncio.Semaphore(concurrency)

    async def bounded_request(prompt: str):
        # The semaphore caps simultaneous in-flight requests at `concurrency`.
        async with semaphore:
            return await single_request(client, prompt)

    try:
        start_total = time.perf_counter()
        results = await asyncio.gather(*[bounded_request(p) for p in prompts])
        end_total = time.perf_counter()
    finally:
        # BUG FIX: the client (and its HTTP connection pool) was never closed.
        await client.close()

    valid = [r for r in results if r is not None]
    ttfts = sorted(r.ttft_ms for r in valid)
    tps_list = [r.throughput_tps for r in valid]
    total_tokens = sum(r.completion_tokens for r in valid)
    elapsed = end_total - start_total
    # Clamp the p99 index so small sample sizes can never index past the end.
    p99_index = min(int(len(ttfts) * 0.99), len(ttfts) - 1)
    return BenchmarkResult(
        total_requests=len(prompts),
        concurrency=concurrency,
        avg_ttft_ms=statistics.mean(ttfts) if ttfts else 0,
        p50_ttft_ms=statistics.median(ttfts) if ttfts else 0,
        p99_ttft_ms=ttfts[p99_index] if ttfts else 0,
        avg_throughput_tps=statistics.mean(tps_list) if tps_list else 0,
        total_throughput_tps=total_tokens / elapsed if elapsed > 0 else 0,
        success_rate=len(valid) / len(prompts)
    )
# A mix of code-generation and explanation prompts, repeated to create load.
BENCHMARK_PROMPTS = [
    "Implement a binary search tree in Python.",
    "Create a simple REST API with FastAPI.",
    "Explain the types of SQL JOINs with examples.",
    "Explain async/await in asynchronous programming.",
    "Explain the difference between Docker containers and virtual machines.",
    "What is the difference between Python generators and iterators?",
    "Explain the 6 principles of RESTful API design.",
    "What are the methods to prevent overfitting in machine learning?",
] * 4  # 32 requests
if __name__ == "__main__":
    # Sweep concurrency levels to show how batching affects TTFT and throughput.
    for concurrency in [1, 4, 8]:
        print(f"\nConcurrent requests: {concurrency}")
        result = asyncio.run(run_benchmark(BENCHMARK_PROMPTS, concurrency))
        result.print_report()
Terminal window
export ANTHROPIC_BASE_URL="http://localhost:8000/v1"
export ANTHROPIC_MODEL="deepseek-coder-v2"
# local_agent.py — Run an agent with a local model
from openai import OpenAI

# Point the OpenAI SDK at the local vLLM server; the key value is ignored by vLLM.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="x")
def ask_coder(task: str) -> str:
    """Send *task* to the locally served coder model and return its reply text."""
    system_message = {"role": "system", "content": "You are an expert Python developer."}
    user_message = {"role": "user", "content": task}
    completion = client.chat.completions.create(
        model="deepseek-coder-v2",
        messages=[system_message, user_message],
        max_tokens=2048,
        temperature=0.1,
    )
    return completion.choices[0].message.content

Submit a PR to assignments/lab-10/[student-id]/:

  • start_server.sh — vLLM server startup script
  • test_api.py — Basic API call test
  • benchmark.py — Complete benchmark script
  • benchmark_results.json — Actual measured results for concurrency 1/4/8
  • local_agent.py — Local vLLM-based agent
  • vllm_server.log — Server startup log (first 100 lines)
  • README.md — Installation process, benchmark result analysis, performance/cost comparison vs. Claude API