Build an AI Customer Support Agent
Build a RAG-powered customer support chatbot that answers questions from your own documentation — using OXLO's Embeddings and Chat APIs.
Embeddings + Chat Completions
What You'll Build
A support bot that can answer questions about your product or service by:
- Indexing your documentation into vector embeddings using OXLO's Embeddings API
- Retrieving the most relevant docs when a user asks a question
- Generating accurate, grounded answers using OXLO's Chat API
- Citing its sources so users can verify the answer
Architecture
User Question
OXLO Embeddings
bge-large
Vector Search
Top 3 matches
Relevant Documents
OXLO Chat API
deepseek-v3.2
Answer + Sources
Prerequisites
- Python 3.10+ installed
- An OXLO API key (Free tier works — get one here)
- Basic Python knowledge
Step 1 — Project Setup
mkdir oxlo-support-bot && cd oxlo-support-bot
pip install openai python-dotenv numpy

Create your .env file:
# .env
OXLO_API_KEY=your_api_key_here
OXLO_BASE_URL=https://api.oxlo.ai/v1

Create the project structure:
oxlo-support-bot/
├── .env
├── knowledge_base.py # Document indexing with embeddings
├── support_bot.py # Chat agent with RAG
├── docs/ # Your documentation files
│ ├── getting-started.txt
│ ├── pricing.txt
│ └── troubleshooting.txt
└── run.py # Interactive CLI

Step 2 — Build the Knowledge Base
First, let's create some sample documentation, then index it using OXLO's Embeddings API.
Create Sample Docs
# Create docs/ directory with sample content
# docs/getting-started.txt
"""
Getting Started with Acme SaaS
To create an account, visit app.acme.com and click 'Sign Up'.
You'll need a valid email address. After signing up, you'll receive
a verification email within 2 minutes.
To create your first project, go to Dashboard > New Project.
Each project comes with a free API key displayed in the Settings tab.
The free plan includes 1,000 API calls per month. Upgrade to Pro
for 50,000 calls at $29/month.
"""
# docs/pricing.txt
"""
Acme SaaS Pricing
Free Plan: 1,000 API calls/month, 1 project, community support
Pro Plan: 50,000 API calls/month, 10 projects, email support — $29/month
Enterprise: Unlimited calls, custom SLA, dedicated support — Contact us
All plans include SSL encryption and 99.9% uptime SLA.
Billing is monthly. You can cancel anytime from Settings > Billing.
"""
# docs/troubleshooting.txt
"""
Troubleshooting Common Issues
API Key Not Working:
- Check that your API key is active in Settings > API Keys
- Ensure you're using the correct environment (prod vs staging)
- API keys expire after 90 days — regenerate if needed
Rate Limit Errors (429):
- Free plan: 10 requests/second
- Pro plan: 100 requests/second
- Add exponential backoff to your retry logic
Connection Timeout:
- Our API endpoint is api.acme.com
- Check your firewall settings
- Try increasing the timeout to 30 seconds
"""

Build the Embedding Index
# knowledge_base.py
"""Knowledge base using OXLO Embeddings API for semantic search."""
import os
import json
import numpy as np
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
client = OpenAI(
api_key=os.getenv("OXLO_API_KEY"),
base_url=os.getenv("OXLO_BASE_URL", "https://api.oxlo.ai/v1"),
)
EMBEDDING_MODEL = "bge-large" # Free tier!
def chunk_text(text: str, chunk_size: int = 300) -> list[str]:
    """
    Split *text* into overlapping chunks for retrieval.

    Lines are accumulated until the running character count reaches
    ``chunk_size``; the chunk is then flushed, and the line that tipped it
    over is carried into the next chunk so adjacent chunks overlap — small
    enough for precise retrieval, large enough for meaningful context.
    """
    lines = (raw.strip() for raw in text.replace("\n\n", "\n").split("\n"))
    chunks: list[str] = []
    buffer: list[str] = []
    size = 0
    for line in lines:
        if not line:
            continue
        buffer.append(line)
        size += len(line)
        if size >= chunk_size:
            chunks.append(" ".join(buffer))
            # Overlap: seed the next chunk with the line just flushed.
            buffer = [line]
            size = len(line)
    if buffer:
        chunks.append(" ".join(buffer))
    return chunks
def get_embedding(text: str) -> list[float]:
    """Embed *text* via OXLO's Embeddings API and return its vector."""
    result = client.embeddings.create(model=EMBEDDING_MODEL, input=text)
    return result.data[0].embedding
def get_embeddings_batch(texts: list[str]) -> list[list[float]]:
    """
    Embed several texts with one Embeddings API request.

    A single batched call avoids one HTTP round-trip per text, so it is
    much faster than calling get_embedding() in a loop.
    """
    result = client.embeddings.create(model=EMBEDDING_MODEL, input=texts)
    return [row.embedding for row in result.data]
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """
    Return the cosine similarity between vectors *a* and *b*.

    Result is in [-1, 1]; 1.0 means identical direction. Returns 0.0 when
    either vector has zero magnitude — the original formula divided by
    zero there, yielding NaN, which would silently poison the ranking
    produced by KnowledgeBase.search().
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)
class KnowledgeBase:
    """
    Simple in-memory vector store for document retrieval.
    In production, you'd use a vector DB like Pinecone, Weaviate,
    or pgvector.
    """

    def __init__(self):
        # Each entry: {"text": chunk, "source": filename, "embedding": vector}.
        self.documents: list[dict] = []

    def index_directory(self, docs_dir: str):
        """Index all .txt files using batch embedding (single API call)."""
        docs_path = Path(docs_dir)
        all_chunks = []
        all_sources = []
        for file_path in docs_path.glob("*.txt"):
            print(f" Indexing {file_path.name}...")
            text = file_path.read_text(encoding="utf-8")
            chunks = chunk_text(text)
            all_chunks.extend(chunks)
            # One source entry per chunk keeps chunks/filenames aligned for zip below.
            all_sources.extend([file_path.name] * len(chunks))
        # Single batch API call instead of one call per chunk
        print(f" Embedding {len(all_chunks)} chunks in one batch...")
        embeddings = get_embeddings_batch(all_chunks)
        for chunk, source, embedding in zip(all_chunks, all_sources, embeddings):
            self.documents.append({
                "text": chunk,
                "source": source,
                "embedding": embedding,
            })
        print(f" Indexed {len(self.documents)} chunks from {docs_dir}")

    def search(self, query: str, top_k: int = 3) -> list[dict]:
        """Search the knowledge base using semantic similarity."""
        # Embed the query once, then score it against every stored chunk
        # (linear scan — fine for a handful of docs, not for thousands).
        query_embedding = get_embedding(query)
        scored = []
        for doc in self.documents:
            score = cosine_similarity(query_embedding, doc["embedding"])
            scored.append({**doc, "score": score})
        # Highest similarity first; keep only the top_k best matches.
        scored.sort(key=lambda x: x["score"], reverse=True)
        return scored[:top_k]

    def save(self, path: str = "index.json"):
        """Save the index to disk (avoids re-embedding on restart)."""
        with open(path, "w") as f:
            json.dump(self.documents, f)
        print(f" Saved index to {path}")

    def load(self, path: str = "index.json"):
        """Load a previously saved index."""
        with open(path) as f:
            self.documents = json.load(f)
        print(f" Loaded {len(self.documents)} chunks from {path}")

Try it out:
# test_knowledge_base.py
from knowledge_base import KnowledgeBase

# Index your docs (makes one batch embedding API call)
kb = KnowledgeBase()
kb.index_directory("docs/")

# Search for relevant info: print each hit's similarity score,
# source file, and the first 100 characters of the matched chunk.
results = kb.search("How much does the pro plan cost?")
for r in results:
    print(f"Score: {r['score']:.3f} | Source: {r['source']}")
    print(f" {r['text'][:100]}...")
    print()

Expected output:
📄 Indexing getting-started.txt...
📄 Indexing pricing.txt...
📄 Indexing troubleshooting.txt...
✅ Indexed 8 chunks from docs/
Score: 0.847 | Source: pricing.txt
Acme SaaS Pricing Free Plan: 1,000 API calls/month, 1 project, community support Pro Plan: 50,000...
Score: 0.721 | Source: getting-started.txt
The free plan includes 1,000 API calls per month. Upgrade to Pro for 50,000 calls at $29/month...
Score: 0.634 | Source: pricing.txt
All plans include SSL encryption and 99.9% uptime SLA. Billing is monthly. You can cancel anytime...

Step 3 — Build the Support Bot
Now combine retrieval with the Chat API to generate grounded answers:
# support_bot.py
"""AI Customer Support bot using RAG with OXLO APIs."""
import os
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
from knowledge_base import KnowledgeBase
load_dotenv()
client = OpenAI(
api_key=os.getenv("OXLO_API_KEY"),
base_url=os.getenv("OXLO_BASE_URL", "https://api.oxlo.ai/v1"),
)
CHAT_MODEL = "deepseek-v3.2" # Free tier!
# Grounding rules sent as the first system message on every request;
# rule 2's fallback sentence is exactly what the bot echoes when
# retrieval finds nothing relevant.
SYSTEM_PROMPT = """You are a helpful customer support assistant for Acme SaaS.
RULES:
1. Answer questions ONLY using the provided context documents.
2. If the context doesn't contain the answer, say "I don't have that information in my knowledge base. Please contact support@acme.com."
3. Always cite which document your answer came from.
4. Be concise and friendly.
5. If the user greets you, respond warmly and ask how you can help."""
class SupportBot:
    """RAG-powered customer support chatbot."""

    def __init__(self, docs_dir: str = "docs/", index_path: str = "index.json"):
        self.kb = KnowledgeBase()
        # Alternating user/assistant messages, kept for follow-up questions.
        self.conversation_history: list[dict] = []
        # Load cached index if available, otherwise build and cache it
        if Path(index_path).exists():
            print("Loading cached knowledge base...")
            self.kb.load(index_path)
        else:
            print("Building knowledge base (first run)...")
            self.kb.index_directory(docs_dir)
            self.kb.save(index_path)
        print("Support bot ready!\n")

    def answer(self, user_question: str) -> str:
        """
        Answer a user question using RAG.
        1. Retrieve relevant docs from the knowledge base
        2. Feed them as context to the LLM
        3. Generate a grounded answer
        """
        # Step 1: Retrieve relevant documents
        relevant_docs = self.kb.search(user_question, top_k=3)
        # Step 2: Format context for the LLM
        context = "\n\n".join([
            f"[Source: {doc['source']}]\n{doc['text']}"
            for doc in relevant_docs
        ])
        # Step 3: Build the prompt with retrieved context
        self.conversation_history.append({
            "role": "user",
            "content": user_question,
        })
        # Context goes in a fresh system message each turn — it is rebuilt
        # per question and never stored in conversation_history.
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "system",
                "content": f"Context documents:\n\n{context}",
            },
            *self.conversation_history,
        ]
        # Step 4: Generate answer with OXLO Chat API
        response = client.chat.completions.create(
            model=CHAT_MODEL,
            messages=messages,
            max_tokens=500,
            temperature=0.3,
        )
        answer = response.choices[0].message.content
        # Add to conversation history for follow-up questions
        self.conversation_history.append({
            "role": "assistant",
            "content": answer,
        })
        # Keep conversation history manageable: once it exceeds 20
        # messages, keep only the most recent 10.
        if len(self.conversation_history) > 20:
            self.conversation_history = self.conversation_history[-10:]
        return answer

Step 4 — Build the Chat Interface
Create a simple interactive CLI to chat with your bot:
# run.py
"""Interactive CLI for the OXLO Support Bot."""
from support_bot import SupportBot
def main():
    """Run the interactive support-bot REPL until the user quits."""
    bot = SupportBot(docs_dir="docs/")
    rule = "━" * 50
    # Banner
    for line in (rule, " 🤖 Acme SaaS Support Bot", " Powered by OXLO AI", " Type 'quit' to exit", rule, ""):
        print(line)
    while True:
        question = input("You: ").strip()
        if not question:
            continue  # ignore blank input
        if question.lower() in ("quit", "exit", "bye"):
            print("Bot: Goodbye! 👋")
            break
        print(f"\nBot: {bot.answer(question)}\n")
if __name__ == "__main__":
    main()

Step 5 — Run and Test
python run.py

Example conversation:
🔄 Building knowledge base (first run)...
Indexing getting-started.txt...
Indexing pricing.txt...
Indexing troubleshooting.txt...
Embedding 8 chunks in one batch...
Indexed 8 chunks from docs/
Saved index to index.json
Support bot ready!
# Second run loads instantly (no API calls):
# 🔄 Loading cached knowledge base...
# Loaded 8 chunks from index.json
# Support bot ready!
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🤖 Acme SaaS Support Bot
Powered by OXLO AI
Type 'quit' to exit
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
You: How much does the Pro plan cost?
Bot: The Pro plan costs **$29/month** and includes:
- 50,000 API calls per month
- Up to 10 projects
- Email support
Billing is monthly and you can cancel anytime from Settings > Billing.
📄 Source: pricing.txt
You: My API key isn't working
Bot: Here are some things to check:
1. **Verify it's active** — Go to Settings > API Keys and confirm
2. **Check the environment** — Make sure you're using the right key for prod vs staging
3. **Check expiry** — API keys expire after 90 days. Regenerate if needed
If none of these help, please contact support@acme.com.
📄 Source: troubleshooting.txt
You: What's your refund policy?
Bot: I don't have that information in my knowledge base. Please contact
support@acme.com for refund-related questions.
You: quit
Bot: Goodbye! 👋

Performance Tips
If you notice slow responses, here are the optimizations already built into the code above and additional tips for production:
Batch Embedding (already applied)
The index_directory() method embeds all chunks in a single API call using get_embeddings_batch() instead of one call per chunk. This reduces indexing from ~8 sequential HTTP round-trips to just 1, cutting startup time by 5-10x.
Index Caching (already applied)
The bot saves the embedding index to index.json after the first run. On subsequent starts, it loads the cached index instantly (no API calls needed). Delete index.json to force a re-index when your docs change.
Query Latency Breakdown
Each user query makes 2 API calls: one to embed the query (~100-200ms) and one to generate the answer (~500-2000ms). The embedding call is the minimum overhead for semantic search. For production, use async (AsyncOpenAI) to avoid blocking, and consider caching frequent queries.
Model Selection for Speed
deepseek-v3.2 is powerful but large. For faster responses in a support bot, try mistral-7b or llama-3.2-3b (both free tier) which respond 2-5x faster and are sufficient for grounded Q&A tasks.
Extending Your Bot
Add a FastAPI Endpoint
Expose your bot as a REST API for integration with websites or apps:
# api_server.py
"""Expose the support bot as a REST API."""
from fastapi import FastAPI
from pydantic import BaseModel
from support_bot import SupportBot

app = FastAPI(title="Acme Support Bot API")
# NOTE(review): a single module-level bot means one shared conversation
# history for all callers — session_id is accepted below but not yet used
# to separate sessions.
bot = SupportBot(docs_dir="docs/")

class ChatRequest(BaseModel):
    # The user's chat message; session_id is reserved for per-user history.
    message: str
    session_id: str = "default"

class ChatResponse(BaseModel):
    answer: str

@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Answer one support question with a grounded, cited reply."""
    answer = bot.answer(request.message)
    return ChatResponse(answer=answer)
# Run with: uvicorn api_server:app --reload

Use a Vector Database
For production, replace the in-memory store with a proper vector database:
# Example: Using ChromaDB for persistent storage
import chromadb

# PersistentClient keeps vectors on disk under ./chroma_db between runs.
chroma = chromadb.PersistentClient(path="./chroma_db")
collection = chroma.get_or_create_collection("support_docs")

# When indexing:
embedding = get_embedding(chunk)  # OXLO Embeddings API
collection.add(
    ids=[f"doc_{i}"],
    embeddings=[embedding],
    documents=[chunk],
    metadatas=[{"source": filename}],
)

# When searching:
query_embedding = get_embedding(user_question)  # OXLO Embeddings API
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3,
)

Add Multi-Language Support
OXLO models like Qwen 3 32B and DeepSeek V3.2 support 100+ languages out of the box. Just add a note in your system prompt:
SYSTEM_PROMPT = """You are a helpful customer support assistant.
Always respond in the same language the user writes in.
..."""

Key Concepts
📊 Embeddings for Retrieval
OXLO's bge-large model converts text into 1024-dimensional vectors. Similar texts produce similar vectors, enabling semantic search — the user doesn't need to use exact keywords.
🎯 Grounding with Context
By injecting retrieved documents into the system prompt, the LLM generates answers grounded in your actual data — dramatically reducing hallucinations compared to using the LLM alone.
💬 Conversation Memory
The bot keeps a conversation history so users can ask follow-up questions naturally. The history is trimmed to the last 10 exchanges to stay within the model's context window.