👇🏻 Below is a complete example of a repository-level code completion task:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" # the device to load the model onto
# Now you do not need to add "trust_remote_code=True"
TOKENIZER = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
MODEL = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B", device_map="auto").eval()
# tokenize the input into tokens
input_text = """<|repo_name|>library-system
<|file_sep|>library.py
class Book:
    def __init__(self, title, author, isbn, copies):
        self.title = title
        self.author = author
        self.isbn = isbn
        self.copies = copies

    def __str__(self):
        return f"Title: {self.title}, Author: {self.author}, ISBN: {self.isbn}, Copies: {self.copies}"

class Library:
    def __init__(self):
        self.books = []

    def add_book(self, title, author, isbn, copies):
        book = Book(title, author, isbn, copies)
        self.books.append(book)

    def find_book(self, isbn):
        for book in self.books:
            if book.isbn == isbn:
                return book
        return None

    def list_books(self):
        return self.books
<|file_sep|>student.py
class Student:
    def __init__(self, name, id):
        self.name = name
        self.id = id
        self.borrowed_books = []

    def borrow_book(self, book, library):
        if book and book.copies > 0:
            self.borrowed_books.append(book)
            book.copies -= 1
            return True
        return False

    def return_book(self, book, library):
        if book in self.borrowed_books:
            self.borrowed_books.remove(book)
            book.copies += 1
            return True
        return False
<|file_sep|>main.py
from library import Library
from student import Student
def main():
    # Set up the library with some books
    library = Library()
    library.add_book("The Great Gatsby", "F. Scott Fitzgerald", "1234567890", 3)
    library.add_book("To Kill a Mockingbird", "Harper Lee", "1234567891", 2)

    # Set up a student
    student = Student("Alice", "S1")

    # Student borrows a book
"""
model_inputs = TOKENIZER([input_text], return_tensors="pt").to(device)
# Use `max_new_tokens` to control the maximum output length.
generated_ids = MODEL.generate(model_inputs.input_ids, max_new_tokens=1024, do_sample=False)[0]
# The generated_ids include prompt_ids, so we only need to decode the tokens after prompt_ids.
output_text = TOKENIZER.decode(generated_ids[len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Prompt: \n{input_text}\n\nGenerated text: \n{output_text}")
```
The expected output is as follows:
```python
Generated text:
    book = library.find_book("1234567890")
    if student.borrow_book(book, library):
        print(f"{student.name} borrowed {book.title}")
    else:
        print(f"{student.name} could not borrow {book.title}")

    # Student returns a book
    if student.return_book(book, library):
        print(f"{student.name} returned {book.title}")
    else:
        print(f"{student.name} could not return {book.title}")

    # List all books in the library
    print("All books in the library:")
    for book in library.list_books():
        print(book)

if __name__ == "__main__":
    main()
```
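The `<|repo_name|>` and `<|file_sep|>` special tokens shown above are all that structure the repository-level prompt. Below is a minimal sketch of how such a prompt could be assembled from files on disk; the `build_repo_prompt` helper and the directory layout are illustrative assumptions, not part of the Qwen2.5-Coder API:
```python
from pathlib import Path

def build_repo_prompt(repo_dir: str, repo_name: str, extensions=(".py",)) -> str:
    """Assemble a repository-level prompt with the <|repo_name|> and
    <|file_sep|> special tokens, mirroring the example above.
    (Illustrative helper, not an official API.)"""
    parts = [f"<|repo_name|>{repo_name}"]
    for path in sorted(Path(repo_dir).rglob("*")):
        if path.is_file() and path.suffix in extensions:
            rel = path.relative_to(repo_dir)
            parts.append(f"<|file_sep|>{rel}\n{path.read_text()}")
    return "\n".join(parts)

# e.g. rebuild the library-system prompt from a local checkout, placing the
# partially written file last so the model continues it:
# input_text = build_repo_prompt("./library-system", "library-system")
```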
### 👉🏻 Deploying Qwen2.5-Coder with vLLM
As a member of the Qwen2.5 family, Qwen2.5-Coder is supported by vLLM. A detailed tutorial can be found in the [Qwen tutorial](https://qwen.readthedocs.io/en/latest/deployment/vllm.html).
Here, we give a simple example of offline batched inference with vLLM.
#### Offline Batched Inference
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
# Pass the default decoding hyperparameters of Qwen2.5-Coder-32B
# max_tokens controls the maximum output length.
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=1024)
# Input the model name or path. This can also be a GPTQ or AWQ quantized model.
llm = LLM(model="Qwen/Qwen2.5-Coder-32B")
# Prepare your prompts
prompt = "#write a quick sort algorithm.\ndef quick_sort("
# generate outputs
outputs = llm.generate([prompt], sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
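The `transformers` example above uses greedy decoding (`do_sample=False`). If you want the same deterministic behavior from vLLM, setting the temperature to zero disables sampling; here is a minimal sketch reusing the `llm` and `prompt` defined above:
```python
# temperature=0 turns off sampling in vLLM, mirroring do_sample=False above.
greedy_params = SamplingParams(temperature=0.0, max_tokens=1024)
outputs = llm.generate([prompt], greedy_params)
print(outputs[0].outputs[0].text)
```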
#### Multi-GPU Distributed Serving
To scale up serving throughput, distributed serving helps by leveraging more GPU devices. It is also useful when inference on ultra-long sequences would otherwise exhaust the memory of a single GPU. Here, we demonstrate how to run Qwen2.5-Coder-32B with tensor parallelism simply by passing the `tensor_parallel_size` argument.
```python
llm = LLM(model="Qwen/Qwen2.5-Coder-32B", tensor_parallel_size=8)
```
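Set `tensor_parallel_size` to the number of GPUs you want to shard the model across; the resulting `llm` object is then used exactly as in the offline batched inference example above.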
### 👉🏻 Gradio interface 🤗
We also provide a Gradio