diff --git a/CMakeLists.txt b/CMakeLists.txt index edc7989..834c7e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.12) project(qwen.cpp VERSION 0.0.1 LANGUAGES CXX) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib CACHE STRING "") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib CACHE STRING "") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE STRING "") diff --git a/qwen_cpp/openai_api.py b/qwen_cpp/openai_api.py new file mode 100644 index 0000000..8861cd5 --- /dev/null +++ b/qwen_cpp/openai_api.py @@ -0,0 +1,185 @@ +import asyncio +import logging +import time +from typing import List, Literal, Optional, Union + +import qwen_cpp +from fastapi import FastAPI, HTTPException, Request, status +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings +from sse_starlette.sse import EventSourceResponse + +logging.basicConfig(level=logging.INFO, format=r"%(asctime)s - %(module)s - %(levelname)s - %(message)s") + + +class Settings(BaseSettings): + model: str = "qwen14b-ggml.bin" + tiktoken: str = "Qwen-14B-Chat/qwen.tiktoken" + num_threads: int = 0 + + +class ChatMessage(BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +class DeltaMessage(BaseModel): + role: Optional[Literal["system", "user", "assistant"]] = None + content: Optional[str] = None + + +class ChatCompletionRequest(BaseModel): + model: str = "default-model" + messages: List[ChatMessage] + temperature: float = Field(default=0.95, ge=0.0, le=2.0) + top_p: float = Field(default=0.7, ge=0.0, le=1.0) + stream: bool = False + max_tokens: int = Field(default=2048, ge=0) + + model_config = { + "json_schema_extra": {"examples": [{"model": "default-model", "messages": [{"role": "user", "content": "你好"}]}]} + } + + +class ChatCompletionResponseChoice(BaseModel): + index: int = 0 + message: ChatMessage + finish_reason: Literal["stop", "length"] = "stop" + + +class ChatCompletionResponseStreamChoice(BaseModel): + index: int = 0 + delta: DeltaMessage + finish_reason: Optional[Literal["stop", "length"]] = None + + +class ChatCompletionResponse(BaseModel): + id: str = "chatcmpl" + model: str = "default-model" + object: Literal["chat.completion", "chat.completion.chunk"] + created: int = Field(default_factory=lambda: int(time.time())) + choices: Union[List[ChatCompletionResponseChoice], List[ChatCompletionResponseStreamChoice]] + + model_config = { + "json_schema_extra": { + "examples": [ + { + "id": "chatcmpl", + "model": "default-model", + "object": "chat.completion", + "created": 1691166146, + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "你好!"}, + "finish_reason": "stop", + } + ], + } + ] + } + } + + +settings = Settings() +app = FastAPI() +app.add_middleware( + CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"] +) +pipeline = qwen_cpp.Pipeline(settings.model,settings.tiktoken) +lock = asyncio.Lock() + + +def stream_chat(history, body): + yield ChatCompletionResponse( + object="chat.completion.chunk", + choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(role="assistant"))], + ) + + for piece in pipeline.chat( + history, + max_length=body.max_tokens, + do_sample=body.temperature > 0, + top_p=body.top_p, + temperature=body.temperature, + num_threads=settings.num_threads, + stream=True, + ): + yield ChatCompletionResponse( + object="chat.completion.chunk", + choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(content=piece))], + ) + + yield ChatCompletionResponse( + object="chat.completion.chunk", + choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(), finish_reason="stop")], + ) + + +async def stream_chat_event_publisher(history, body): + output = "" + try: + async with lock: + for chunk in stream_chat(history, body): + await asyncio.sleep(0) # yield control back to event loop for cancellation check + output += chunk.choices[0].delta.content or "" + yield chunk.model_dump_json(exclude_unset=True) + logging.info(f'prompt: "{history[-1]}", stream response: "{output}"') + except asyncio.CancelledError as e: + logging.info(f'prompt: "{history[-1]}", stream response (partial): "{output}"') + raise e + + +@app.post("/v1/chat/completions") +async def create_chat_completion(body: ChatCompletionRequest) -> ChatCompletionResponse: + # ignore system messages + history = [msg.content for msg in body.messages if msg.role != "system"] + if len(history) % 2 != 1: + raise HTTPException(status.HTTP_400_BAD_REQUEST, "invalid history size") + + if body.stream: + generator = stream_chat_event_publisher(history, body) + return EventSourceResponse(generator) + + output = pipeline.chat( + history=history, + max_length=body.max_tokens, + do_sample=body.temperature > 0, + top_p=body.top_p, + temperature=body.temperature, + ) + logging.info(f'prompt: "{history[-1]}", sync response: "{output}"') + + return ChatCompletionResponse( + object="chat.completion", + choices=[ChatCompletionResponseChoice(message=ChatMessage(role="assistant", content=output))], + ) + + +class ModelCard(BaseModel): + id: str + object: Literal["model"] = "model" + owned_by: str = "owner" + permission: List = [] + + +class ModelList(BaseModel): + object: Literal["list"] = "list" + data: List[ModelCard] = [] + + model_config = { + "json_schema_extra": { + "examples": [ + { + "object": "list", + "data": [{"id": "gpt-3.5-turbo", "object": "model", "owned_by": "owner", "permission": []}], + } + ] + } + } + + +@app.get("/v1/models") +async def list_models() -> ModelList: + return ModelList(data=[ModelCard(id="gpt-3.5-turbo")])