
Merge pull request #315 from dtrifiro/sync-with-0.7.2
dtrifiro authored Feb 7, 2025
2 parents 30823b6 + 6ec0863 commit 1da3720
Showing 1,096 changed files with 10,632 additions and 3,854 deletions.
6 changes: 4 additions & 2 deletions .buildkite/check-wheel-size.py
@@ -1,12 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 
 
 def print_top_10_largest_files(zip_file):
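Note: for context, the check this script implements plausibly reduces to the sketch below. The environment-variable default and the print_top_10_largest_files name come from the hunk above; the rest of the wiring is an assumption, not the file's verbatim contents.

    # Sketch (assumed wiring): fail the build when a built wheel exceeds
    # VLLM_MAX_SIZE_MB, and list the largest members to aid debugging.
    import os
    import sys
    import zipfile

    VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


    def print_top_10_largest_files(zip_file):
        # Collect (size, name) pairs for every member of the wheel archive.
        with zipfile.ZipFile(zip_file, 'r') as z:
            entries = [(info.file_size, info.filename) for info in z.infolist()]
        for size, name in sorted(entries, reverse=True)[:10]:
            print(f'{name}: {size / (1 << 20):.2f} MiB')


    def check_wheel(path):
        size_mb = os.path.getsize(path) / (1 << 20)
        if size_mb > VLLM_MAX_SIZE_MB:
            print(f'{path} is {size_mb:.2f} MiB, above the '
                  f'{VLLM_MAX_SIZE_MB} MiB limit.')
            print_top_10_largest_files(path)
            return 1
        print(f'{path} is {size_mb:.2f} MiB, within the limit.')
        return 0


    if __name__ == '__main__':
        sys.exit(check_wheel(sys.argv[1]))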
2 changes: 2 additions & 0 deletions .buildkite/generate_index.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import os
 
11 changes: 11 additions & 0 deletions [new lm-eval-harness config; file path not shown in this view]
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
+model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.6353
+  - name: "exact_match,flexible-extract"
+    value: 0.637
+limit: null
+num_fewshot: null
1 change: 1 addition & 0 deletions .buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
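Note: per the docstring, this test compares vLLM results against offline HF baselines stored in configs like the SparseLlama YAML above. A plausible shape for that comparison is sketched below; the RTOL value and exact wiring are assumptions, not the file's contents.

    # Sketch (assumed wiring): load a baseline config, run lm-eval against
    # the model, and compare each metric to its recorded baseline value.
    import lm_eval
    import numpy
    import yaml

    RTOL = 0.05  # assumed tolerance; the real test defines its own


    def check_against_baseline(config_path):
        with open(config_path) as f:
            config = yaml.safe_load(f)

        # Evaluate with lm-eval's vLLM backend on the configured tasks.
        results = lm_eval.simple_evaluate(
            model='vllm',
            model_args=f"pretrained={config['model_name']}",
            tasks=[task['name'] for task in config['tasks']],
            num_fewshot=config['num_fewshot'],
            limit=config['limit'],
        )

        # Each metric key (e.g. "exact_match,strict-match") indexes the
        # measured value in the results dict.
        for task in config['tasks']:
            for metric in task['metrics']:
                measured = results['results'][task['name']][metric['name']]
                assert numpy.isclose(metric['value'], measured, rtol=RTOL), \
                    f"{task['name']}/{metric['name']}: {measured} vs {metric['value']}"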
2 changes: 2 additions & 0 deletions [file path not shown in this view]
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 from pathlib import Path
2 changes: 2 additions & 0 deletions .buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 
 from transformers import AutoTokenizer
2 changes: 2 additions & 0 deletions [file path not shown in this view]
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json
 from pathlib import Path
2 changes: 2 additions & 0 deletions [file path not shown in this view]
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from lmdeploy.serve.openai.api_client import APIClient
 
 api_client = APIClient("http://localhost:8000")
2 changes: 2 additions & 0 deletions [file path not shown in this view]
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import datetime
 import json
 import os
4 changes: 2 additions & 2 deletions .buildkite/run-gh200-test.sh
@@ -23,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and test offline inference
-docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic.py
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
 '
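Note: the added -e HF_TOKEN and Hugging Face cache mount presumably let the container download, or reuse from the host cache, the gated meta-llama/Llama-3.2-1B weights that the new CLI-based test loads.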
8 changes: 6 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -50,9 +50,9 @@ steps:
   - tests/multimodal
   - tests/test_utils
   - tests/worker
-  - tests/standalone_tests/lazy_torch_compile.py
+  - tests/standalone_tests/lazy_imports.py
   commands:
-  - python3 standalone_tests/lazy_torch_compile.py
+  - python3 standalone_tests/lazy_imports.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@@ -128,6 +128,7 @@ steps:
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
   - examples/offline_inference/rlhf.py
+  - examples/offline_inference/ray_placement.py
   commands:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
@@ -136,6 +137,7 @@
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - python3 ../examples/offline_inference/rlhf.py
+  - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py
 
 - label: Metrics, Tracing Test # 10min
   num_gpus: 2
@@ -349,6 +351,7 @@ steps:
   - vllm/
   - tests/models
   commands:
+  - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
   - pytest -v -s models/test_initialization.py
@@ -485,6 +488,7 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
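Note: RAY_DEDUP_LOGS=0 turns off Ray's deduplication of repeated log lines across workers, so the full output of the new placement-group example is visible in the CI log.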
6 changes: 5 additions & 1 deletion .pre-commit-config.yaml
@@ -97,10 +97,14 @@ repos:
     language: system
     verbose: true
     stages: [commit-msg]
+  - id: check-spdx-header
+    name: Check SPDX headers
+    entry: python tools/check_spdx_header.py
+    language: python
+    types: [python]
  - id: suggestion
    name: Suggestion
    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
    language: system
    verbose: true
    pass_filenames: false
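Note: the new hook runs tools/check_spdx_header.py over staged Python files, which is what enforces the "# SPDX-License-Identifier: Apache-2.0" lines appearing throughout this diff. A minimal checker of this kind might look like the sketch below; it is hypothetical, since the tool's actual contents are not shown in this diff.

    # Hypothetical SPDX checker: pre-commit passes the staged Python files
    # as arguments; exit non-zero if any file lacks the header near the top.
    import sys

    SPDX_HEADER = '# SPDX-License-Identifier: Apache-2.0'


    def has_spdx_header(path):
        with open(path, encoding='utf-8') as f:
            # Allow a shebang before the header, as in cmake/hipify.py.
            first_two = [f.readline().strip(), f.readline().strip()]
        return SPDX_HEADER in first_two


    def main(files):
        missing = [path for path in files if not has_spdx_header(path)]
        for path in missing:
            print(f'missing SPDX header: {path}')
        return 1 if missing else 0


    if __name__ == '__main__':
        sys.exit(main(sys.argv[1:]))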

2 changes: 1 addition & 1 deletion CODE_OF_CONDUCT.md
@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.
 
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement in the #code-of-conduct
-channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
+channel in the [vLLM Slack](https://slack.vllm.ai).
 All complaints will be reviewed and investigated promptly and fairly.
 
 All community leaders are obligated to respect the privacy and security of the
2 changes: 1 addition & 1 deletion Dockerfile
@@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=300
+ARG VLLM_MAX_SIZE_MB=400
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
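Note: since RUN_WHEEL_CHECK defaults to true, builds that intentionally exceed the limit can skip or relax the check with standard build arguments, for example docker build --build-arg RUN_WHEEL_CHECK=false . or --build-arg VLLM_MAX_SIZE_MB=450 (the 450 value is only an illustration).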
2 changes: 1 addition & 1 deletion Dockerfile.ubi
@@ -174,7 +174,7 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
     uv pip install \
-        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp312-cp312-linux_x86_64.whl"
+        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"
 
 ENV HF_HUB_OFFLINE=1 \
     HOME=/home/vllm \
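Note: as the URL shows, the FlashInfer bump moves from a wheel built against Torch 2.4 to the renamed flashinfer_python 0.2.0.post2 wheel built against Torch 2.5, presumably tracking the Torch version vLLM 0.7.2 builds against.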
7 changes: 3 additions & 4 deletions README.md
@@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 
 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
 ---
@@ -36,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone
 ## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.
 
-Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
 
 vLLM is fast with:
@@ -139,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us
 
 * For technical questions and feature requests, please use Github issues or discussions.
-* For discussing with fellow users, please use Discord.
-* For coordinating contributions and development, please use Slack.
+* For discussing with fellow users and coordinating contributions and development, please use Slack.
 * For security disclosures, please use Github's security advisory feature.
 * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
2 changes: 2 additions & 0 deletions benchmarks/backend_request_func.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 import sys
1 change: 1 addition & 0 deletions benchmarks/benchmark_guided.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark guided decoding throughput."""
 import argparse
 import dataclasses
1 change: 1 addition & 0 deletions benchmarks/benchmark_latency.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
 import argparse
 import dataclasses
1 change: 1 addition & 0 deletions benchmarks/benchmark_long_document_qa_throughput.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Offline benchmark to test the long document QA throughput.
1 change: 1 addition & 0 deletions benchmarks/benchmark_prefix_caching.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Benchmark the efficiency of prefix caching.
1 change: 1 addition & 0 deletions benchmarks/benchmark_prioritization.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark offline prioritization."""
 import argparse
 import dataclasses
1 change: 1 addition & 0 deletions benchmarks/benchmark_serving.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 r"""Benchmark online serving throughput.
 On the server side, run one of the following commands:
1 change: 1 addition & 0 deletions benchmarks/benchmark_serving_guided.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 r"""Benchmark online serving throughput with guided decoding.
 On the server side, run one of the following commands:
1 change: 1 addition & 0 deletions benchmarks/benchmark_throughput.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark offline inference throughput."""
 import argparse
 import dataclasses
2 changes: 2 additions & 0 deletions benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import itertools
2 changes: 2 additions & 0 deletions benchmarks/cutlass_benchmarks/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Cutlass bench utils
 from typing import Iterable, Tuple
 
2 changes: 2 additions & 0 deletions benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import itertools
2 changes: 2 additions & 0 deletions benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
 # Example:
2 changes: 2 additions & 0 deletions benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import aiohttp
2 changes: 2 additions & 0 deletions benchmarks/disagg_benchmarks/round_robin_proxy.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import itertools
 
2 changes: 2 additions & 0 deletions benchmarks/disagg_benchmarks/visualize_benchmark_results.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 
 import matplotlib.pyplot as plt
2 changes: 2 additions & 0 deletions benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle as pkl
 import time
 from dataclasses import dataclass
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_aqlm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 from typing import Optional
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_layernorm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 
 import torch
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_lora.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import json
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_machete.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import itertools
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_marlin.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import torch
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_moe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import time
 from datetime import datetime
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_paged_attention.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 import time
 from typing import List, Optional
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_quant.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 
 import torch
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_rmsnorm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from typing import Optional, Tuple, Union
 
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_rope.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from itertools import accumulate
 from typing import List, Optional
 
2 changes: 2 additions & 0 deletions benchmarks/kernels/benchmark_shapes.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 WEIGHT_SHAPES = {
     "ideal": [[4 * 256 * 32, 256 * 32]],
     "mistralai/Mistral-7B-v0.1/TP1": [
2 changes: 2 additions & 0 deletions benchmarks/kernels/graph_machete_bench.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 import pickle
 import re
2 changes: 2 additions & 0 deletions benchmarks/kernels/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import Any, Callable, Iterable, Optional
 
2 changes: 2 additions & 0 deletions benchmarks/kernels/weight_shapes.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
 # Example:
2 changes: 2 additions & 0 deletions benchmarks/overheads/benchmark_hashing.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import cProfile
 import pstats
 
1 change: 1 addition & 0 deletions cmake/hipify.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
 
 #
 # A command line tool for running pytorch's hipify preprocessor on CUDA
2 changes: 2 additions & 0 deletions collect_env.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # ruff: noqa
 # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
 
3 changes: 3 additions & 0 deletions csrc/cache.h
@@ -15,6 +15,9 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
                  std::vector<torch::Tensor> const& value_caches,
                  const torch::Tensor& block_mapping);
 
+void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
+                     const torch::Tensor& block_mapping);
+
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,
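Note: copy_blocks_mla mirrors copy_blocks but takes a single kv_caches vector rather than separate key and value caches, presumably because MLA (multi-head latent attention) stores one combined latent KV tensor per layer instead of distinct K and V caches.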
[Diff truncated; 1,096 changed files in total.]