LlamaIndex Integration

Integrate Enclava with LlamaIndex for advanced RAG applications.

Installation

pip install llama-index
pip install llama-index-llms-openai
pip install llama-index-embeddings-openai

Basic Configuration

from llama_index.llms.openai import OpenAI as LlamaOpenAI

# Configure with Enclava endpoint
llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo"
)

# Simple completion
response = llm.complete("Hello, LlamaIndex!")
print(response.text)

Complete Example

from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.llms import ChatMessage

class EnclavaLlamaIndex:
    def __init__(self, api_key, base_url="https://localhost/api/v1"):
        self.llm = LlamaOpenAI(
            api_base=base_url,
            api_key=api_key,
            model="gpt-3.5-turbo"
        )
        self.embed_model = OpenAIEmbedding(
            api_base=base_url,
            api_key=api_key
        )

    def complete(self, prompt):
        """Generate a completion"""
        response = self.llm.complete(prompt)
        return response.text

    def chat(self, messages):
        """Chat with a list of messages"""
        response = self.llm.chat(messages)
        return response.message.content

# Usage
client = EnclavaLlamaIndex(api_key="YOUR_API_KEY")

# Complete
response = client.complete("Tell me about AI")
print(response)

# Chat (LlamaIndex LLMs expect ChatMessage objects, not plain dicts)
messages = [
    ChatMessage(role="system", content="You are helpful."),
    ChatMessage(role="user", content="What is 2+2?")
]
response = client.chat(messages)
print(response)

RAG with LlamaIndex

Simple RAG

from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Configure Enclava
llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo"
)

embed_model = OpenAIEmbedding(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY"
)

# Load documents
documents = SimpleDirectoryReader("documents/").load_data()

# Create index
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model
)

# Create query engine
query_engine = index.as_query_engine(llm=llm)

# Query
response = query_engine.query("What is Enclava?")
print(f"Answer: {response}")
print(f"Sources: {response.source_nodes}")

RAG with Custom Retrieval

from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo"
)

embed_model = OpenAIEmbedding(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY"
)

# Load and index documents
documents = SimpleDirectoryReader("documents/").load_data()
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Create query engine with custom settings
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=5,       # Retrieve top 5 nodes
    response_mode="compact"   # Compact retrieved context before answering
)

# Query
response = query_engine.query("How do I use RAG in Enclava?")
print(f"Answer: {response}")

RAG with Streaming

from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo",
    streaming=True
)

embed_model = OpenAIEmbedding(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY"
)

documents = SimpleDirectoryReader("documents/").load_data()
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Create streaming query engine
query_engine = index.as_query_engine(
    llm=llm,
    streaming=True
)

# Stream response
query = "Tell me about Enclava's features"
response_stream = query_engine.query(query)

for text in response_stream.response_gen:
    print(text, end="", flush=True)

Chat Engine

Simple Chat

from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo"
)

embed_model = OpenAIEmbedding(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY"
)

documents = SimpleDirectoryReader("documents/").load_data()
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Create chat engine with built-in conversation memory
chat_engine = index.as_chat_engine(
    llm=llm,
    chat_mode="condense_question",  # Condense follow-up questions into standalone queries
    verbose=True
)

# Chat
response1 = chat_engine.chat("What is Enclava?")
print(f"User: What is Enclava?")
print(f"AI: {response1}")

response2 = chat_engine.chat("How does it work?")
print(f"User: How does it work?")
print(f"AI: {response2}")

Chat with Streaming

llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo",
    streaming=True
)

documents = SimpleDirectoryReader("documents/").load_data()
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

chat_engine = index.as_chat_engine(
    llm=llm,
    streaming=True,
    chat_mode="condense_question"
)

# Stream chat response
response_stream = chat_engine.stream_chat("Tell me a story")

for text in response_stream.response_gen:
    print(text, end="", flush=True)

Index Management

Persist Index

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Build the index, then persist it to disk
documents = SimpleDirectoryReader("documents/").load_data()
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model
)

index.storage_context.persist("./storage")

Load Persisted Index

from llama_index.core import StorageContext, load_index_from_storage

# Load from storage (pass the same embedding model used to build the index)
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context, embed_model=embed_model)

# Create query engine
query_engine = index.as_query_engine(llm=llm)
response = query_engine.query("What is Enclava?")
print(response)

Advanced RAG

Hybrid Search (Keyword + Semantic)

from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo"
)

documents = SimpleDirectoryReader("documents/").load_data()
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Use hybrid search (requires a vector store that supports hybrid queries)
query_engine = index.as_query_engine(
    llm=llm,
    vector_store_query_mode="hybrid",  # Combine keyword and semantic retrieval
    similarity_top_k=10
)

response = query_engine.query("How do I configure API keys?")
print(response)

Reranking

from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.postprocessor import LLMRerank

llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo"
)

documents = SimpleDirectoryReader("documents/").load_data()
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Use reranking for better results
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=20,  # Retrieve more candidates
    node_postprocessors=[
        LLMRerank(llm=llm, top_n=10)  # Rerank retrieved nodes and keep the best 10
    ]
)

response = query_engine.query("What are the security features?")
print(response)

Multi-Index RAG

from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

llm = LlamaOpenAI(
    api_base="https://your-enclava-instance/api/v1",
    api_key="YOUR_API_KEY",
    model="gpt-3.5-turbo"
)

# Create multiple indexes
docs_index = VectorStoreIndex.from_documents(
    SimpleDirectoryReader("docs/").load_data(),
    embed_model=embed_model
)

faq_index = VectorStoreIndex.from_documents(
    SimpleDirectoryReader("faq/").load_data(),
    embed_model=embed_model
)

# Create query engines for each index
docs_engine = docs_index.as_query_engine(llm=llm, similarity_top_k=3)
faq_engine = faq_index.as_query_engine(llm=llm, similarity_top_k=2)

# Query multiple indexes and combine results
query = "How do I create an API key?"

docs_response = docs_engine.query(query)
faq_response = faq_engine.query(query)

# Combine results
combined_context = f"""
Documentation:
{docs_response}

FAQ:
{faq_response}
"""

# Generate final answer
final_response = llm.complete(
    f"Answer this question using the context below:\n\n{combined_context}\n\nQuestion: {query}"
)

print(final_response.text)

Best Practices

  1. Chunking: Use chunk sizes appropriate to your documents and tune them via the node parser (see the sketch below)
  2. Embedding Model: Choose an embedding model that matches your language and domain
  3. Retrieval: Adjust similarity_top_k based on document size and chunking
  4. Memory: Use a chat engine for conversational applications
  5. Persistence: Persist indexes to avoid re-embedding documents on every run
  6. Hybrid Search: Use hybrid search when exact keyword matches matter
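
A minimal sketch tying these practices together. The chunk_size, chunk_overlap, and similarity_top_k values are illustrative starting points, not Enclava recommendations; llm and embed_model are the Enclava-backed objects configured in the examples above.

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core.node_parser import SentenceSplitter

# Illustrative values only -- tune chunk_size/chunk_overlap for your documents
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=50)
Settings.llm = llm                  # Enclava-backed LLM from the examples above
Settings.embed_model = embed_model  # Enclava-backed embeddings

documents = SimpleDirectoryReader("documents/").load_data()
index = VectorStoreIndex.from_documents(documents)

# Persist once, then reload later instead of re-embedding
index.storage_context.persist("./storage")

# A larger top_k surfaces more context at the cost of latency and token usage
query_engine = index.as_query_engine(similarity_top_k=5)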

Next Steps