Model load command:
python3.12 -m vllm.entrypoints.openai.api_server --max-model-len 4096 --max-num-seqs 1 --no-enable-prefix-caching --port 8009 --tensor-parallel-size 2 --model fraseque/llama-3.2-1B-FP8-Neuron --additional-config '{ "override_neuron_config": { "async_mode": true } }'
v1/completions endpoint call:
curl http://localhost:8009/v1/completions -H "Content-Type: application/json" -d '{
"model": "fraseque/llama-3.2-1B-FP8-Neuron",
"prompt": "My name is"
}'
{"id":"cmpl-96b91c64240240d69fbb6928bf0ded77","object":"text_completion","created":1768319906,"model":"fraseque/llama-3.2-1B-FP8-Neuron","choices":[{"index":0,"text":"rowingobao arbitrary-count Conservamentelve roiularesskin�endencyseyjunctionintonInstant","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":4,"total_tokens":20,"completion_tokens":16,"prompt_tokens_details":null},"kv_transfer_params":null}
Model load command:
python3.12 -m vllm.entrypoints.openai.api_server --max-model-len 4096 --max-num-seqs 1 --no-enable-prefix-caching --port 8009 --tensor-parallel-size 2 --model fraseque/llama-3.2-1B-FP8-Neuron --additional-config '{ "override_neuron_config": { "async_mode": true } }'
v1/completions endpoint call:
curl http://localhost:8009/v1/completions -H "Content-Type: application/json" -d '{
"model": "fraseque/llama-3.2-1B-FP8-Neuron",
"prompt": "My name is"
}'
{"id":"cmpl-96b91c64240240d69fbb6928bf0ded77","object":"text_completion","created":1768319906,"model":"fraseque/llama-3.2-1B-FP8-Neuron","choices":[{"index":0,"text":"rowingobao arbitrary-count Conservamentelve roiularesskin�endencyseyjunctionintonInstant","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":4,"total_tokens":20,"completion_tokens":16,"prompt_tokens_details":null},"kv_transfer_params":null}