# Chat Completions

Generate text responses using the OpenAI-compatible chat completions endpoint.
## Endpoint

```
POST /api/v1/chat/completions
```
## Basic Request

### Using cURL
```bash
curl -X POST http://localhost/api/v1/chat/completions \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Hello! How are you?"}
    ]
  }'
```
### Using Python
```python
import requests

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello! How are you?"}
        ]
    }
)

data = response.json()
print(data["choices"][0]["message"]["content"])
```
### Using JavaScript
```javascript
const response = await fetch('http://localhost/api/v1/chat/completions', {
  method: 'POST',
  headers: {
    'Authorization': 'Bearer YOUR_API_KEY',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({
    model: 'gpt-3.5-turbo',
    messages: [
      { role: 'system', content: 'You are a helpful assistant.' },
      { role: 'user', content: 'Hello! How are you?' }
    ]
  })
});

const data = await response.json();
console.log(data.choices[0].message.content);
```
## Request Parameters

| Parameter | Type | Required | Description | Default |
|---|---|---|---|---|
| model | string | Yes | Model ID to use | - |
| messages | array | Yes | Array of message objects | - |
| temperature | number | No | Sampling randomness (0-2) | 1.0 |
| max_tokens | integer | No | Max tokens to generate | - |
| top_p | number | No | Nucleus sampling (0-1) | 1.0 |
| top_k | integer | No | Top-k sampling | - |
| frequency_penalty | number | No | Reduce repetition (-2 to 2) | 0.0 |
| presence_penalty | number | No | Encourage new topics (-2 to 2) | 0.0 |
| stop | string/array | No | Stop sequences | - |
| stream | boolean | No | Enable streaming | false |
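
The optional parameters can be combined freely in a single request. The sketch below is illustrative only; the parameter values are arbitrary examples, not tuned defaults.

```python
import requests

# Illustrative request combining several optional sampling parameters;
# the values shown are arbitrary examples, not recommended defaults.
response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Suggest three blog post titles."}],
        "temperature": 0.8,        # moderate randomness
        "top_p": 0.9,              # nucleus sampling: keep the top 90% probability mass
        "max_tokens": 100,         # cap the completion length
        "frequency_penalty": 0.5,  # discourage repeated tokens
        "stop": ["\n\n"]           # halt at the first blank line
    }
)
print(response.json()["choices"][0]["message"]["content"])
```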
## Message Format

Messages are an array of objects with these roles:

| Role | Purpose | Example |
|---|---|---|
| system | Set behavior and personality | "You are a helpful assistant." |
| user | User input | "What is the capital of France?" |
| assistant | Assistant's previous responses | "The capital of France is Paris." |
| function | Function call result | See tool calling docs |
| tool | Tool response | See tool calling docs |
### Example Conversation

```json
{
  "model": "gpt-3.5-turbo",
  "messages": [
    {
      "role": "system",
      "content": "You are a helpful, friendly assistant."
    },
    {
      "role": "user",
      "content": "What is 2+2?"
    },
    {
      "role": "assistant",
      "content": "2+2 equals 4."
    },
    {
      "role": "user",
      "content": "What about 5+3?"
    }
  ]
}
```
## Response Format

```json
{
  "id": "chatcmpl-abc123",
  "object": "chat.completion",
  "created": 1699012345,
  "model": "gpt-3.5-turbo",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "5+3 equals 8."
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 50,
    "completion_tokens": 10,
    "total_tokens": 60
  }
}
```
## Examples

### System Prompt
```python
response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "system",
                "content": "You are an expert in Python programming. Provide clear, efficient code examples."
            },
            {
                "role": "user",
                "content": "How do I sort a list in Python?"
            }
        ]
    }
)
```
### Temperature Control
```python
# Lower temperature = more deterministic
response_low = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Tell me a joke."}],
        "temperature": 0.2  # Very focused, consistent
    }
)

# Higher temperature = more creative
response_high = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Write a creative story."}],
        "temperature": 1.5  # Very creative, varied
    }
)
```
### Max Tokens
```python
response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "Write a long article about AI."}
        ],
        "max_tokens": 500  # Limit the completion to 500 tokens
    }
)
```
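
When the cap is hit, OpenAI-compatible responses typically report a `finish_reason` of `"length"` instead of `"stop"`, which makes truncation easy to detect:

```python
result = response.json()
choice = result["choices"][0]

if choice["finish_reason"] == "length":
    # The completion hit the max_tokens cap before finishing.
    print("Warning: response was truncated; consider raising max_tokens.")

print(choice["message"]["content"])
```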
### Streaming Responses
```python
import json
import requests

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "Tell me a story."}
        ],
        "stream": True
    },
    stream=True
)

# Parse the server-sent event stream line by line
for line in response.iter_lines():
    if line:
        line = line.decode('utf-8')
        if line.startswith('data: '):
            data = line[6:]
            if data == '[DONE]':
                break
            try:
                chunk = json.loads(data)
                if 'choices' in chunk and len(chunk['choices']) > 0:
                    delta = chunk['choices'][0]['delta']
                    if 'content' in delta:
                        print(delta['content'], end='', flush=True)
            except json.JSONDecodeError:
                pass
```
### Stop Sequences
```python
response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "Count from 1 to 10."}
        ],
        "stop": ["11", "eleven"]  # Halt generation before "11" or "eleven" is emitted
    }
)
```
### Multi-turn Conversation
```python
conversation = [
    {"role": "system", "content": "You are a helpful assistant."}
]

# First message
response1 = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": conversation + [
            {"role": "user", "content": "What's the capital of France?"}
        ]
    }
)

conversation.append({"role": "user", "content": "What's the capital of France?"})
conversation.append({
    "role": "assistant",
    "content": response1.json()["choices"][0]["message"]["content"]
})

# Second message (with context)
response2 = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": conversation + [
            {"role": "user", "content": "And what about Germany?"}
        ]
    }
)

print(response2.json()["choices"][0]["message"]["content"])
```
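
The same pattern can be wrapped in a small helper that appends each exchange to the running history. The `ChatSession` class below is an illustrative sketch, not part of this API:

```python
import requests

class ChatSession:
    """Illustrative helper that keeps conversation history across turns."""

    def __init__(self, api_key, model="gpt-3.5-turbo",
                 url="http://localhost/api/v1/chat/completions",
                 system_prompt="You are a helpful assistant."):
        self.url = url
        self.headers = {"Authorization": f"Bearer {api_key}"}
        self.model = model
        self.messages = [{"role": "system", "content": system_prompt}]

    def send(self, user_content):
        # Record the user turn, call the API with the full history,
        # then record the assistant's reply so later turns have context.
        self.messages.append({"role": "user", "content": user_content})
        response = requests.post(
            self.url,
            headers=self.headers,
            json={"model": self.model, "messages": self.messages},
        )
        response.raise_for_status()
        reply = response.json()["choices"][0]["message"]["content"]
        self.messages.append({"role": "assistant", "content": reply})
        return reply

session = ChatSession("YOUR_API_KEY")
print(session.send("What's the capital of France?"))
print(session.send("And what about Germany?"))
```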
### Token Usage Tracking

The response includes usage information:
```python
response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello"}]
    }
)

usage = response.json()["usage"]
print(f"Prompt tokens: {usage['prompt_tokens']}")
print(f"Completion tokens: {usage['completion_tokens']}")
print(f"Total tokens: {usage['total_tokens']}")
```
## Error Handling

### Invalid Model
```python
try:
    response = requests.post(
        "http://localhost/api/v1/chat/completions",
        headers={"Authorization": "Bearer YOUR_API_KEY"},
        json={
            "model": "invalid-model",
            "messages": [{"role": "user", "content": "Hello"}]
        }
    )
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    error = response.json()
    print(f"Error: {error['error']['message']}")
    print(f"Type: {error['error']['type']}")
```
### Rate Limit
```python
import time
import requests

def make_request_with_retry(messages, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.post(
                "http://localhost/api/v1/chat/completions",
                headers={"Authorization": "Bearer YOUR_API_KEY"},
                json={"model": "gpt-3.5-turbo", "messages": messages}
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                raise
    raise Exception("Max retries exceeded")
```
### Budget Exceeded
```python
response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hello"}]}
)

if response.status_code == 429:
    error = response.json()
    if "budget" in error["error"]["message"].lower():
        print("Budget limit exceeded. Check your API key budget settings.")
```
## Best Practices

- **Include System Prompts** - Set behavior and personality
- **Use Appropriate Temperature** - Lower for factual tasks, higher for creative ones
- **Handle Streaming** - Provide better UX for long responses
- **Track Token Usage** - Monitor costs and adjust parameters
- **Implement Retry Logic** - Handle rate limits gracefully
- **Cache Responses** - Cache identical requests when appropriate (see the sketch below)
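
As a minimal sketch of the caching suggestion above, an in-process dictionary keyed on a hash of the request body avoids repeat calls; a shared store such as Redis would be the production-grade equivalent. The `cached_chat` helper is hypothetical, and caching only pays off when responses are expected to be reproducible (e.g. `temperature` 0):

```python
import hashlib
import json
import requests

_cache = {}  # in-memory cache; swap for Redis or similar in production

def cached_chat(payload, url="http://localhost/api/v1/chat/completions",
                api_key="YOUR_API_KEY"):
    # Key the cache on a stable serialization of the request body so that
    # byte-identical requests hit the cache instead of the API.
    key = hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()
    if key not in _cache:
        response = requests.post(
            url,
            headers={"Authorization": f"Bearer {api_key}"},
            json=payload,
        )
        response.raise_for_status()
        _cache[key] = response.json()
    return _cache[key]
```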
## Next Steps

- **Embeddings** - Generate text embeddings
- **Integrations** - Use with popular frameworks
- **Chatbots** - Create persistent chatbots with RAG