A RAG (Retrieval-Augmented Generation) chatbot answers questions based on your own documents — not just its training data. This guide builds one from scratch using Python, ChromaDB, and Claude. By the end, you’ll have a chatbot that can answer questions about any PDF or text file you give it.
What Is RAG?
RAG combines two things:
- Retrieval: search your documents for relevant chunks
- Generation: use an LLM to write an answer based on those chunks
Without RAG, Claude can only answer questions based on its training data. With RAG, you inject relevant context directly into the prompt — making the model answer from your data.
Architecture
The chatbot has two phases:
- Indexing: load documents → split into chunks → embed each chunk → store in vector database
- Querying: embed user question → find similar chunks → send chunks + question to Claude → return answer
Prerequisites
- Python 3.9+
- Anthropic API key (set as `ANTHROPIC_API_KEY`)
- Basic Python knowledge
Step 1: Install Dependencies
pip install anthropic chromadb sentence-transformers pypdf2

What each package does:
- anthropic — Claude API
- chromadb — local vector database
- sentence-transformers — embed text into vectors
- pypdf2 — extract text from PDFs
Step 2: Load and Chunk Documents
Split documents into overlapping chunks so context isn’t lost at boundaries:
import PyPDF2
from pathlib import Path
def load_pdf(path: str) -> str:
    """Extract and concatenate the text of every page in a PDF.

    Pages are joined with newlines. PyPDF2's ``extract_text()`` returns
    ``None`` for pages with no extractable text (e.g. scanned images);
    those pages are treated as empty strings instead of crashing the join.
    """
    with open(path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
def load_txt(path: str) -> str:
    """Read a plain-text file as UTF-8 and return its full contents."""
    with open(path, encoding="utf-8") as f:
        return f.read()
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split *text* into whitespace-delimited word chunks.

    Each chunk holds up to *chunk_size* words, and consecutive chunks share
    *overlap* words so context isn't lost at boundaries.

    Raises:
        ValueError: if overlap >= chunk_size — the index would never
            advance and the loop below would never terminate.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks: list[str] = []
    i = 0
    while i < len(words):
        chunks.append(" ".join(words[i : i + chunk_size]))
        i += chunk_size - overlap
    return chunks

# Step 3: Build the Vector Index
Embed each chunk and store it in ChromaDB:
import chromadb
from chromadb.utils import embedding_functions
# Use a lightweight embedding model (runs locally, no API needed)
EMBED_MODEL = "all-MiniLM-L6-v2"
# Wraps the sentence-transformers model so ChromaDB can embed both
# documents at add() time and queries at query() time.
embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)
# Create a persistent ChromaDB client — the index stored under ./chroma_db
# survives between runs, so documents only need to be indexed once.
client = chromadb.PersistentClient(path="./chroma_db")
def build_index(documents: list[dict], collection_name: str = "docs"):
    """Chunk, embed, and store documents in a ChromaDB collection.

    documents: [{"text": "...", "source": "file.pdf"}, ...]

    Returns the (created or existing) collection.
    """
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embed_fn,
    )
    texts, ids, metadatas = [], [], []
    for doc in documents:
        for j, chunk in enumerate(chunk_text(doc["text"])):
            texts.append(chunk)
            # IDs are deterministic per source+chunk, so re-indexing the
            # same document upserts rather than duplicates.
            ids.append(f"{doc['source']}_chunk_{j}")
            metadatas.append({"source": doc["source"], "chunk": j})
    # Chroma rejects an empty add(); skip it when there is nothing to index.
    if texts:
        collection.add(documents=texts, ids=ids, metadatas=metadatas)
    print(f"Indexed {len(texts)} chunks into '{collection_name}'")
    return collection

# Step 4: Query the Index
Find the most relevant chunks for a user’s question:
def retrieve(question: str, collection, n_results: int = 5) -> list[str]:
    """Return the *n_results* stored chunks most similar to *question*."""
    hits = collection.query(
        query_texts=[question],
        n_results=n_results,
    )
    # Chroma returns one result list per query text; we sent exactly one.
    return hits["documents"][0]

# Step 5: Generate an Answer with Claude
Pack the retrieved chunks into the prompt and ask Claude to answer:
import anthropic

# Reads ANTHROPIC_API_KEY from the environment (see Prerequisites).
claude = anthropic.Anthropic()

def generate_answer(question: str, context_chunks: list[str]) -> str:
    """Ask Claude to answer *question* using only *context_chunks*."""
    context = "\n\n---\n\n".join(context_chunks)
    prompt = f"""You are a helpful assistant. Answer the question using ONLY the context below.
If the answer is not in the context, say "I don't have that information."
Context:
{context}
Question: {question}
Answer:"""
    user_message = {"role": "user", "content": prompt}
    message = claude.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=1024,
        messages=[user_message],
    )
    # The reply is a list of content blocks; the first holds the text.
    return message.content[0].text

# Step 6: Tie It Together — Full Chatbot
import anthropic
import chromadb
from chromadb.utils import embedding_functions
import PyPDF2
from pathlib import Path
# ── Config ────────────────────────────────────────────────────────────────────
EMBED_MODEL = "all-MiniLM-L6-v2"  # local sentence-transformers embedding model
COLLECTION = "docs"               # ChromaDB collection name
DB_PATH = "./chroma_db"           # on-disk location of the persistent index
# ── Helpers ───────────────────────────────────────────────────────────────────
# Module-level singletons shared by index_files() and ask().
embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)
chroma = chromadb.PersistentClient(path=DB_PATH)
claude = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
def load_document(path: str) -> str:
    """Return the full text of a document — PDF or plain text, by extension.

    PyPDF2's ``extract_text()`` returns ``None`` for pages with no
    extractable text (e.g. scanned images); those pages are treated as
    empty strings instead of crashing the join.
    """
    p = Path(path)
    if p.suffix.lower() == ".pdf":
        with open(p, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            return "\n".join(page.extract_text() or "" for page in reader.pages)
    # Anything that is not a PDF is read as UTF-8 text.
    return p.read_text(encoding="utf-8")
def chunk_text(text: str, size: int = 500, overlap: int = 50) -> list[str]:
    """Split *text* into word chunks of up to *size* words, with *overlap*
    words shared between consecutive chunks.

    Raises:
        ValueError: if overlap >= size — the index would never advance
            and the loop would never terminate.
    """
    if overlap >= size:
        raise ValueError("overlap must be smaller than size")
    words = text.split()
    chunks, i = [], 0
    while i < len(words):
        chunks.append(" ".join(words[i : i + size]))
        i += size - overlap
    return chunks
def index_files(file_paths: list[str]) -> None:
    """Load, chunk, embed, and store each file in the shared collection."""
    collection = chroma.get_or_create_collection(COLLECTION, embedding_function=embed_fn)
    texts, ids, metas = [], [], []
    for path in file_paths:
        text = load_document(path)
        for j, chunk in enumerate(chunk_text(text)):
            texts.append(chunk)
            # NOTE(review): IDs use the file *name* only, so two files with
            # the same name in different directories would collide — confirm
            # inputs, or switch to the full path.
            ids.append(f"{Path(path).name}_chunk_{j}")
            # Include the chunk index, matching build_index()'s metadata.
            metas.append({"source": path, "chunk": j})
    # Chroma rejects an empty add(); skip it when nothing was extracted.
    if texts:
        collection.add(documents=texts, ids=ids, metadatas=metas)
    print(f"Indexed {len(texts)} chunks from {len(file_paths)} files")
def ask(question: str, n_results: int = 5) -> str:
    """Retrieve the most relevant chunks and have Claude answer from them."""
    collection = chroma.get_collection(COLLECTION, embedding_function=embed_fn)
    hits = collection.query(query_texts=[question], n_results=n_results)
    context = "\n\n---\n\n".join(hits["documents"][0])
    # Instruct the model to stay grounded in the retrieved context.
    prompt = (
        f"Answer based ONLY on the context below.\n"
        f"If not found, say 'I don't have that information.'\n\n"
        f"Context:\n{context}\n\nQuestion: {question}"
    )
    response = claude.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text
# ── Main ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import sys

    paths = sys.argv[1:]
    if paths:
        # Index mode: python chatbot.py doc1.pdf doc2.txt
        index_files(paths)
    else:
        # Chat mode: interactive loop until the user quits.
        print("RAG Chatbot ready. Type 'quit' to exit.\n")
        while True:
            question = input("You: ").strip()
            if question.lower() in ("quit", "exit"):
                break
            print("Bot:", ask(question), "\n")

# Using the Chatbot
First, index your documents:
python chatbot.py company_handbook.pdf product_docs.txt

Then chat:
python chatbot.py
# RAG Chatbot ready. Type 'quit' to exit.
# You: What is the return policy?
# Bot: According to the handbook, returns are accepted within 30 days...

Adding Conversation History
For multi-turn conversations, maintain a history alongside the RAG context:
# Alternating user/assistant turns, shared across calls.
history = []

def ask_with_history(question: str) -> str:
    """Answer *question* with RAG context while carrying conversation history."""
    collection = chroma.get_collection(COLLECTION, embedding_function=embed_fn)
    top = collection.query(query_texts=[question], n_results=3)["documents"][0]
    context = "\n\n".join(top)
    # Prior turns first, then the new RAG-augmented question.
    new_turn = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {question}",
    }
    response = claude.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=1024,
        system="Answer using the provided context. Be concise.",
        messages=history + [new_turn],
    )
    answer = response.content[0].text
    # Record only the bare question (not the injected context) so history
    # stays small; note the list grows without bound across calls.
    history.append({"role": "user", "content": question})
    history.append({"role": "assistant", "content": answer})
    return answer

# Performance Tips
- Chunk size matters: 300–600 words is a good starting range. Too small = missing context; too large = noise
- Increase n_results for complex questions that need more context
- Metadata filtering: add where={'source': 'specific_file.pdf'} to the query to limit search to a specific document
- Reranking: for production, run a cross-encoder reranker on the top 20 results, then pass only the top 5 to Claude
- Caching: cache embeddings so re-indexing the same file is fast
Summary
You’ve built a working RAG chatbot that:
- Loads PDF and text documents
- Splits them into overlapping chunks
- Embeds and stores chunks in ChromaDB
- Retrieves relevant chunks for each question
- Generates grounded answers with Claude
Next steps: explore RAG Tutorial with Python for more advanced techniques, or read the RAG vs Fine-Tuning comparison to decide which approach fits your use case.