Tensorize vLLM Model

Source: examples/tensorize_vllm_model.py in vllm-project/vllm.

import argparse
import dataclasses
import json
import os
import uuid
from functools import partial

from tensorizer import stream_io

from vllm import LLM
from vllm.distributed import (init_distributed_environment,
                              initialize_model_parallel)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
                                                         TensorizerConfig,
                                                         serialize_vllm_model)

# yapf conflicts with isort for this docstring
# yapf: disable
 21"""
 22tensorize_vllm_model.py is a script that can be used to serialize and 
 23deserialize vLLM models. These models can be loaded using tensorizer 
 24to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
 25or locally. Tensor encryption and decryption is also supported, although 
 26libsodium must be installed to use it. Install vllm with tensorizer support 
 27using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
 28https://github.com/coreweave/tensorizer
 29
 30To serialize a model, install vLLM from source, then run something 
 31like this from the root level of this repository:
 32
 33python -m examples.tensorize_vllm_model \
 34   --model facebook/opt-125m \
 35   serialize \
 36   --serialized-directory s3://my-bucket \
 37   --suffix v1
 38   
 39Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
 40and saves it to your S3 bucket. A local directory can also be used. This
 41assumes your S3 credentials are specified as environment variables
 42in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and 
 43`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide 
 44`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint` 
 45as CLI args to this script.
 46
 47You can also encrypt the model weights with a randomly-generated key by 
 48providing a `--keyfile` argument.
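
For example (an illustrative variant of the serialize command above; the key
path is a placeholder):

python -m examples.tensorize_vllm_model \
   --model facebook/opt-125m \
   serialize \
   --serialized-directory s3://my-bucket \
   --suffix v1 \
   --keyfile ./model.key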

To deserialize a model, you can run something like this from the root
level of this repository:

python -m examples.tensorize_vllm_model \
   --model EleutherAI/gpt-j-6B \
   --dtype float16 \
   deserialize \
   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors

This downloads the model tensors from your S3 bucket and deserializes them.

You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.

For more information on the available arguments for serializing, run
`python -m examples.tensorize_vllm_model serialize --help`.

Or for deserializing:

`python -m examples.tensorize_vllm_model deserialize --help`.

Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:

    llm = LLM(model="facebook/opt-125m",
              load_format="tensorizer",
              model_loader_extra_config=TensorizerConfig(
                  tensorizer_uri=path_to_tensors,
                  num_readers=3,
              ))

A serialized model can be used during model loading for the vLLM OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
desired TensorizerConfig arguments.
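
For example (an illustrative invocation; the model and URI are placeholders
that should match what you serialized):

python -m vllm.entrypoints.openai.api_server \
   --model facebook/opt-125m \
   --load-format tensorizer \
   --model-loader-extra-config '{"tensorizer_uri": "s3://my-bucket/vllm/facebook/opt-125m/v1/model.tensors"}'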

To see all of the available arguments that can be passed to `TensorizerConfig`
to configure loading with tensorizer, run:

`python -m examples.tensorize_vllm_model deserialize --help`

and look under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
"""


def parse_args():
    parser = argparse.ArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize vLLM models. These models "
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption are "
        "also supported, although libsodium must be installed to "
        "use them.")
    parser = EngineArgs.add_cli_args(parser)
    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))
    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")

    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize.")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args()


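# NOTE: deserialize() relies on the module-level `args` and
# `tensorizer_config` globals, which are populated below before it is called.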
def deserialize():
    llm = LLM(model=args.model,
              load_format="tensorizer",
              model_loader_extra_config=tensorizer_config)
    return llm


args = parse_args()

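# Resolve S3 credentials: explicit CLI arguments take precedence over the
# corresponding environment variables.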
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                    or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                        or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
               or os.environ.get("S3_ENDPOINT_URL", None))

credentials = {
    "s3_access_key_id": s3_access_key_id,
    "s3_secret_access_key": s3_secret_access_key,
    "s3_endpoint": s3_endpoint
}

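# Stream openers for reading and writing tensor data, preconfigured with the
# resolved S3 credentials.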
_read_stream, _write_stream = (partial(
    stream_io.open_stream,
    mode=mode,
    s3_access_key_id=s3_access_key_id,
    s3_secret_access_key=s3_secret_access_key,
    s3_endpoint=s3_endpoint,
) for mode in ("rb", "wb+"))

model_ref = args.model
model_name = model_ref.split("/")[-1]

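# Set up a single-process distributed environment; vLLM requires the
# distributed and model-parallel state to be initialized even for one GPU.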
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8080"

init_distributed_environment(world_size=1, rank=0, local_rank=0)
initialize_model_parallel()

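# `--keyfile` is only defined on the chosen subparser, so fall back to None
# when it is absent (e.g. if no subcommand was given).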
keyfile = getattr(args, "keyfile", None)

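# If `--model-loader-extra-config` was supplied, build the TensorizerConfig
# directly from its JSON payload rather than from individual CLI arguments.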
if args.model_loader_extra_config:
    config = json.loads(args.model_loader_extra_config)
    if args.command == "deserialize":
        # `tensorizer_uri` and `--path-to-tensors` are functionally the same
        # here; prefer the explicit CLI argument.
        config["tensorizer_uri"] = args.path_to_tensors
    tensorizer_config = TensorizerConfig(**config)
else:
    tensorizer_config = None

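# Serialize: build an engine from the EngineArgs-derived CLI flags, then write
# the tensors to `<serialized-directory>/vllm/<model_ref>/<suffix>/model.tensors`.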
if args.command == "serialize":
    eng_args_dict = {f.name: getattr(args, f.name) for f in
                     dataclasses.fields(EngineArgs)}

    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
    engine = LLMEngine.from_engine_args(engine_args)

    input_dir = args.serialized_directory.rstrip('/')
    suffix = args.suffix if args.suffix else uuid.uuid4().hex
    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
    model_path = f"{base_path}/model.tensors"
    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        **credentials)
    serialize_vllm_model(engine, tensorizer_config, keyfile)
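# Deserialize: load the model through the LLM entrypoint to verify that the
# serialized tensors can be read back.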
elif args.command == "deserialize":
    if tensorizer_config is None:
        tensorizer_config = TensorizerConfig(
            tensorizer_uri=args.path_to_tensors,
            encryption_keyfile=keyfile,
            **credentials
        )
    deserialize()
else:
    raise ValueError("Either serialize or deserialize must be specified.")