
Chat Completions

Generate text responses using the OpenAI-compatible chat completions endpoint.

Endpoint

POST /api/v1/chat/completions

Basic Request

Using cURL

curl -X POST http://localhost/api/v1/chat/completions \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Hello! How are you?"}
    ]
  }'

Using Python

import requests

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello! How are you?"}
        ]
    }
)

data = response.json()
print(data["choices"][0]["message"]["content"])

Using JavaScript

const response = await fetch('http://localhost/api/v1/chat/completions', {
  method: 'POST',
  headers: {
    'Authorization': 'Bearer YOUR_API_KEY',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({
    model: 'gpt-3.5-turbo',
    messages: [
      { role: 'system', content: 'You are a helpful assistant.' },
      { role: 'user', content: 'Hello! How are you?' }
    ]
  })
});

const data = await response.json();
console.log(data.choices[0].message.content);

Request Parameters

| Parameter | Type | Required | Description | Default |
|---|---|---|---|---|
| model | string | Yes | Model ID to use | - |
| messages | array | Yes | Array of message objects | - |
| temperature | number | No | Sampling randomness (0-2) | 1.0 |
| max_tokens | integer | No | Max tokens to generate | - |
| top_p | number | No | Nucleus sampling (0-1) | 1.0 |
| top_k | integer | No | Top-k sampling | - |
| frequency_penalty | number | No | Reduce repetition (-2 to 2) | 0.0 |
| presence_penalty | number | No | Encourage new topics (-2 to 2) | 0.0 |
| stop | string/array | No | Stop sequences | - |
| stream | boolean | No | Enable streaming | false |
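
The optional parameters can be combined in a single request. A minimal sketch building on the Python example above (the specific values are illustrative, not recommendations):

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Summarize the plot of Hamlet."}],
        "temperature": 0.7,   # moderate randomness
        "max_tokens": 200,    # cap the response length
        "top_p": 0.9,         # nucleus sampling
        "stop": ["\n\n"]      # stop at the first blank line
    }
)
print(response.json()["choices"][0]["message"]["content"])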

Message Format

Messages are an array of objects with these roles:

| Role | Purpose | Example |
|---|---|---|
| system | Set behavior and personality | "You are a helpful assistant." |
| user | User input | "What is the capital of France?" |
| assistant | Assistant's previous responses | "The capital of France is Paris." |
| function | Function call result | See tool calling docs |
| tool | Tool response | See tool calling docs |

Example Conversation

{
  "model": "gpt-3.5-turbo",
  "messages": [
    {
      "role": "system",
      "content": "You are a helpful, friendly assistant."
    },
    {
      "role": "user",
      "content": "What is 2+2?"
    },
    {
      "role": "assistant",
      "content": "2+2 equals 4."
    },
    {
      "role": "user",
      "content": "What about 5+3?"
    }
  ]
}

Response Format

{
  "id": "chatcmpl-abc123",
  "object": "chat.completion",
  "created": 1699012345,
  "model": "gpt-3.5-turbo",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "5+3 equals 8."
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 50,
    "completion_tokens": 10,
    "total_tokens": 60
  }
}
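
In Python, these fields can be read directly from the parsed JSON. For example, reusing the `response` object from the basic request above:

data = response.json()

message = data["choices"][0]["message"]["content"]   # assistant reply
finish_reason = data["choices"][0]["finish_reason"]  # "stop", "length", etc.
total_tokens = data["usage"]["total_tokens"]         # prompt + completion tokens

print(f"{message} (finish_reason={finish_reason}, tokens={total_tokens})")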

Examples

System Prompt

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "system",
                "content": "You are an expert in Python programming. Provide clear, efficient code examples."
            },
            {
                "role": "user",
                "content": "How do I sort a list in Python?"
            }
        ]
    }
)

Temperature Control

# Lower temperature = more deterministic
response_low = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Tell me a joke."}],
        "temperature": 0.2  # Very focused, consistent
    }
)

# Higher temperature = more creative
response_high = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Write a creative story."}],
        "temperature": 1.5  # Very creative, varied
    }
)

Max Tokens

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "Write a long article about AI."}
        ],
        "max_tokens": 500  # Limit to 500 tokens
    }
)
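
When the cap is reached mid-response, OpenAI-compatible servers typically report a finish_reason of "length" instead of "stop", which you can use to detect truncation. A minimal sketch using the response format shown earlier:

choice = response.json()["choices"][0]
if choice["finish_reason"] == "length":
    # The model stopped because it hit max_tokens, not because it finished
    print("Response was truncated; consider raising max_tokens.")
print(choice["message"]["content"])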

Streaming Responses

import json
import requests

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "Tell me a story."}
        ],
        "stream": True
    },
    stream=True
)

for line in response.iter_lines():
    if line:
        line = line.decode('utf-8')
        if line.startswith('data: '):
            data = line[6:]
            if data == '[DONE]':
                break
            try:
                chunk = json.loads(data)
                if 'choices' in chunk and len(chunk['choices']) > 0:
                    delta = chunk['choices'][0]['delta']
                    if 'content' in delta:
                        print(delta['content'], end='', flush=True)
            except json.JSONDecodeError:
                pass
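
If you want to reuse this logic, it can be wrapped in a small generator that yields content deltas as they arrive. A minimal sketch (the `stream_chat` helper is illustrative, not part of the API):

def stream_chat(messages, model="gpt-3.5-turbo"):
    """Yield content deltas from a streaming chat completion."""
    response = requests.post(
        "http://localhost/api/v1/chat/completions",
        headers={"Authorization": "Bearer YOUR_API_KEY"},
        json={"model": model, "messages": messages, "stream": True},
        stream=True
    )
    for line in response.iter_lines():
        if not line:
            continue
        line = line.decode('utf-8')
        if not line.startswith('data: '):
            continue
        data = line[6:]
        if data == '[DONE]':
            break
        try:
            chunk = json.loads(data)
        except json.JSONDecodeError:
            continue
        choices = chunk.get('choices')
        if not choices:
            continue
        delta = choices[0].get('delta', {})
        if 'content' in delta:
            yield delta['content']

# Print incrementally while also collecting the full text
full_text = ""
for piece in stream_chat([{"role": "user", "content": "Tell me a story."}]):
    print(piece, end='', flush=True)
    full_text += piece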

Stop Sequences

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "Count from 1 to 10."}
        ],
        "stop": ["11", "eleven"]  # Stop generating before either sequence is emitted
    }
)

Multi-turn Conversation

conversation = [
    {"role": "system", "content": "You are a helpful assistant."}
]

# First message
response1 = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": conversation + [
            {"role": "user", "content": "What's the capital of France?"}
        ]
    }
)
conversation.append({"role": "user", "content": "What's the capital of France?"})
conversation.append({
    "role": "assistant",
    "content": response1.json()["choices"][0]["message"]["content"]
})

# Second message (with context)
response2 = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": conversation + [
            {"role": "user", "content": "And what about Germany?"}
        ]
    }
)
print(response2.json()["choices"][0]["message"]["content"])
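
The append-and-resend pattern above can be factored into a small helper so each turn is a single call. A minimal sketch (the `send_message` helper is illustrative, not part of the API):

def send_message(conversation, user_message, model="gpt-3.5-turbo"):
    """Append a user turn, call the API, record and return the assistant reply."""
    conversation.append({"role": "user", "content": user_message})
    response = requests.post(
        "http://localhost/api/v1/chat/completions",
        headers={"Authorization": "Bearer YOUR_API_KEY"},
        json={"model": model, "messages": conversation}
    )
    reply = response.json()["choices"][0]["message"]["content"]
    conversation.append({"role": "assistant", "content": reply})
    return reply

conversation = [{"role": "system", "content": "You are a helpful assistant."}]
print(send_message(conversation, "What's the capital of France?"))
print(send_message(conversation, "And what about Germany?"))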

Token Usage Tracking

The response includes usage information:

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello"}]
    }
)

usage = response.json()["usage"]
print(f"Prompt tokens: {usage['prompt_tokens']}")
print(f"Completion tokens: {usage['completion_tokens']}")
print(f"Total tokens: {usage['total_tokens']}")

Error Handling

Invalid Model

try:
    response = requests.post(
        "http://localhost/api/v1/chat/completions",
        headers={"Authorization": "Bearer YOUR_API_KEY"},
        json={
            "model": "invalid-model",
            "messages": [{"role": "user", "content": "Hello"}]
        }
    )
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    error = response.json()
    print(f"Error: {error['error']['message']}")
    print(f"Type: {error['error']['type']}")

Rate Limit

import time
import requests

def make_request_with_retry(messages, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.post(
                "http://localhost/api/v1/chat/completions",
                headers={"Authorization": "Bearer YOUR_API_KEY"},
                json={"model": "gpt-3.5-turbo", "messages": messages}
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                raise
    raise Exception("Max retries exceeded")
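
For example, to issue a single request through the retry wrapper defined above:

result = make_request_with_retry([{"role": "user", "content": "Hello"}])
print(result["choices"][0]["message"]["content"])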

Budget Exceeded

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hello"}]}
)

if response.status_code == 429:
    error = response.json()
    if "budget" in error["error"]["message"].lower():
        print("Budget limit exceeded. Check your API key budget settings.")

Best Practices

  1. Include System Prompts - Set behavior and personality
  2. Use Appropriate Temperature - Lower for factual, higher for creative
  3. Handle Streaming - Provide better UX for long responses
  4. Track Token Usage - Monitor costs and adjust parameters
  5. Implement Retry Logic - Handle rate limits gracefully
  6. Cache Responses - Cache identical requests when appropriate (see the sketch below)
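
For item 6, identical requests can be served from a local cache keyed on the request payload. A minimal in-memory sketch (the `cached_chat` helper and cache key scheme are illustrative, not part of the API):

import hashlib
import json
import requests

_cache = {}

def cached_chat(payload):
    """Return a cached response for an identical payload, or call the API."""
    key = hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()
    if key not in _cache:
        response = requests.post(
            "http://localhost/api/v1/chat/completions",
            headers={"Authorization": "Bearer YOUR_API_KEY"},
            json=payload
        )
        _cache[key] = response.json()
    return _cache[key]

payload = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hello"}]}
first = cached_chat(payload)   # hits the API
second = cached_chat(payload)  # served from the cache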

Next Steps