Embeddings

Convert text into vector representations for semantic search and similarity matching.

Endpoint

POST /api/v1/embeddings

Basic Request

Using cURL

curl -X POST http://localhost/api/v1/embeddings \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "text-embedding-ada-002",
    "input": "The quick brown fox jumps over the lazy dog."
  }'

Using Python

import requests

response = requests.post(
    "http://localhost/api/v1/embeddings",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "text-embedding-ada-002",
        "input": "The quick brown fox jumps over the lazy dog."
    }
)

data = response.json()
embedding = data["data"][0]["embedding"]
print(f"Embedding dimensions: {len(embedding)}")
print(f"First 5 values: {embedding[:5]}")

Using JavaScript

const response = await fetch('http://localhost/api/v1/embeddings', {
  method: 'POST',
  headers: {
    'Authorization': 'Bearer YOUR_API_KEY',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({
    model: 'text-embedding-ada-002',
    input: 'The quick brown fox jumps over the lazy dog.'
  })
});

const data = await response.json();
const embedding = data.data[0].embedding;
console.log(`Embedding dimensions: ${embedding.length}`);
console.log(`First 5 values: ${embedding.slice(0, 5)}`);

Request Parameters

| Parameter | Type | Required | Description | Default |
|---|---|---|---|---|
| model | string | Yes | Embedding model to use | - |
| input | string/array | Yes | Text(s) to embed | - |
| encoding_format | string | No | Format of the output embeddings | float |
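If you need a more compact response payload, encoding_format can be set explicitly. A hedged sketch, assuming the endpoint follows the OpenAI-style convention where "base64" returns each vector as a base64-encoded array of little-endian float32 values:

import base64

import numpy as np
import requests

response = requests.post(
    "http://localhost/api/v1/embeddings",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "text-embedding-ada-002",
        "input": "Hello, world!",
        "encoding_format": "base64"  # assumption: accepts "float" (default) or "base64"
    }
)

# Decode the base64 payload back into a float32 vector
raw = base64.b64decode(response.json()["data"][0]["embedding"])
embedding = np.frombuffer(raw, dtype=np.float32)
print(len(embedding))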

Response Format

{
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "embedding": [
        0.0023,
        -0.0235,
        0.0527,
        ...
      ],
      "index": 0
    }
  ],
  "model": "text-embedding-ada-002",
  "usage": {
    "prompt_tokens": 8,
    "total_tokens": 8
  }
}

Examples

Single Text

response = requests.post(
    "http://localhost/api/v1/embeddings",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "text-embedding-ada-002",
        "input": "Hello, world!"
    }
)

embedding = response.json()["data"][0]["embedding"]
print(f"Generated {len(embedding)}-dimension vector")

Multiple Texts

response = requests.post(
    "http://localhost/api/v1/embeddings",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "text-embedding-ada-002",
        "input": [
            "The cat sat on the mat.",
            "The dog ran around the park."
        ]
    }
)

embeddings = response.json()["data"]
for item in embeddings:
    print(f"Index {item['index']}: {len(item['embedding'])} dimensions")

Large Documents

text = "Your long document text here..."

# For long texts, consider chunking
chunk_size = 500 # words per chunk
chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# Generate embeddings for each chunk
embeddings = []
for i, chunk in enumerate(chunks):
response = requests.post(
"http://localhost/api/v1/embeddings",
headers={"Authorization": "Bearer YOUR_API_KEY"},
json={
"model": "text-embedding-ada-002",
"input": chunk
}
)
embedding = response.json()["data"][0]["embedding"]
embeddings.append({"index": i, "embedding": embedding})
print(f"Embedded chunk {i+1}/{len(chunks)}")

print(f"Total embeddings: {len(embeddings)}")

Use Cases

Semantic Search

Create embeddings for documents, then search. The snippets in this section use a get_embedding helper, sketched first.
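A minimal sketch of that helper, wrapping the embeddings endpoint shown earlier (the function name and defaults are a local convenience, not part of the API):

import requests

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a single text."""
    response = requests.post(
        "http://localhost/api/v1/embeddings",
        headers={"Authorization": "Bearer YOUR_API_KEY"},
        json={"model": model, "input": text}
    )
    response.raise_for_status()
    return response.json()["data"][0]["embedding"]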

import numpy as np

# Store documents with embeddings
documents = [
    {"text": "Python is a programming language.", "embedding": get_embedding("Python is a programming language.")},
    {"text": "JavaScript is used for web development.", "embedding": get_embedding("JavaScript is used for web development.")}
]

# Search
query = "What is Python?"
query_embedding = get_embedding(query)

# Calculate cosine similarity (simplified)
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

results = []
for doc in documents:
    similarity = cosine_similarity(query_embedding, doc["embedding"])
    results.append({"text": doc["text"], "similarity": similarity})

results.sort(key=lambda x: x["similarity"], reverse=True)
print(f"Most relevant: {results[0]['text']}")

Text Classification

# Create reference embeddings for categories
categories = {
    "technical": get_embedding("Technical documentation and manuals"),
    "marketing": get_embedding("Marketing and promotional content"),
    "support": get_embedding("Customer support and FAQs")
}

# Classify new text
text = "How do I configure the API?"
text_embedding = get_embedding(text)

best_category = None
best_similarity = -1

for category, category_embedding in categories.items():
    similarity = cosine_similarity(text_embedding, category_embedding)
    if similarity > best_similarity:
        best_similarity = similarity
        best_category = category

print(f"Category: {best_category} (similarity: {best_similarity:.2f})")

Clustering

from sklearn.cluster import KMeans

# Get embeddings for multiple texts
texts = ["Python", "JavaScript", "Go", "Rust", "Ruby", "Java"]
embeddings = [get_embedding(text) for text in texts]

# Cluster embeddings
kmeans = KMeans(n_clusters=3, random_state=0)
clusters = kmeans.fit_predict(embeddings)

for text, cluster in zip(texts, clusters):
    print(f"{text}: Cluster {cluster}")

Best Practices

Chunking Long Text

def chunk_text(text, max_tokens=800):
    """Split text into word-based chunks of roughly max_tokens words."""
    # Simple word-based chunking; word count approximates token count
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        current_chunk.append(word)
        current_length += 1
        if current_length >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

text = "Your long document..."
chunks = chunk_text(text)

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}...")
    embedding = get_embedding(chunk)
    # Store embedding with chunk metadata
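Hard cuts can split a sentence across two chunks and weaken both embeddings. A common refinement, sketched here as a suggestion rather than a requirement of the API, is to let consecutive chunks overlap:

def chunk_text_overlap(text, max_tokens=800, overlap=100):
    """Word-based chunking where consecutive chunks share `overlap` words."""
    words = text.split()
    step = max_tokens - overlap  # assumes overlap < max_tokens
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_tokens]))
        if start + max_tokens >= len(words):
            break
    return chunks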

Batch Processing

import time

def batch_embed(texts, batch_size=10, delay=1.0):
    """Process embeddings in batches to avoid rate limits"""
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]

        response = requests.post(
            "http://localhost/api/v1/embeddings",
            headers={"Authorization": "Bearer YOUR_API_KEY"},
            json={
                "model": "text-embedding-ada-002",
                "input": batch
            }
        )

        all_embeddings.extend(response.json()["data"])
        print(f"Processed batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1}")

        if i + batch_size < len(texts):
            time.sleep(delay)

    return all_embeddings

texts = ["Text 1", "Text 2", "Text 3", ...]
embeddings = batch_embed(texts)

Caching Embeddings

import hashlib
import json
import os

class EmbeddingCache:
    def __init__(self, cache_file="embedding_cache.json"):
        self.cache_file = cache_file
        self.cache = self._load_cache()

    def _load_cache(self):
        if os.path.exists(self.cache_file):
            with open(self.cache_file, 'r') as f:
                return json.load(f)
        return {}

    def _save_cache(self):
        with open(self.cache_file, 'w') as f:
            json.dump(self.cache, f)

    def _get_key(self, text):
        return hashlib.md5(text.encode()).hexdigest()

    def get(self, text):
        key = self._get_key(text)
        return self.cache.get(key)

    def set(self, text, embedding):
        key = self._get_key(text)
        self.cache[key] = embedding
        self._save_cache()

# Usage
cache = EmbeddingCache()

text = "Python is great!"

# Check cache first
cached = cache.get(text)
if cached:
    embedding = cached
    print("From cache")
else:
    embedding = get_embedding(text)
    cache.set(text, embedding)
    print("From API")

Integration with RAG

Use embeddings to build a knowledge base:

# 1. Upload and embed documents
# (load_documents, store_in_vector_db, and search_vector_db are
# placeholders for your document loader and vector database)
documents = load_documents()
for doc in documents:
    embedding = get_embedding(doc["content"])
    store_in_vector_db({
        "id": doc["id"],
        "content": doc["content"],
        "embedding": embedding,
        "metadata": doc["metadata"]
    })

# 2. Search
query = "How to use the API?"
query_embedding = get_embedding(query)

results = search_vector_db(query_embedding, top_k=5)

# 3. Use results in chat completion
context = "\n\n".join([r["content"] for r in results])

response = requests.post(
    "http://localhost/api/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "system",
                "content": f"Answer using this context:\n\n{context}"
            },
            {
                "role": "user",
                "content": query
            }
        ]
    }
)
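Assuming the chat endpoint returns the usual OpenAI-style shape (consistent with the embeddings response above, but an assumption nonetheless), the grounded answer can then be read from the first choice:

# 4. Read the grounded answer from the first choice
answer = response.json()["choices"][0]["message"]["content"]
print(answer)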

Next Steps