How to Build a RAG Chatbot with Python

A RAG (Retrieval-Augmented Generation) chatbot answers questions based on your own documents — not just its training data. This guide builds one from scratch using Python, ChromaDB, and Claude. By the end, you’ll have a chatbot that can answer questions about any PDF or text file you give it.


What Is RAG?

RAG combines two things:

  • Retrieval: search your documents for relevant chunks
  • Generation: use an LLM to write an answer based on those chunks

Without RAG, Claude can only answer questions based on its training data. With RAG, you inject relevant context directly into the prompt — making the model answer from your data.
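
The "injection" is nothing more exotic than string formatting. A toy illustration (the context string here is made up):

# Toy example of injecting retrieved context into a prompt
context = "Returns are accepted within 30 days of purchase."
question = "What is the return policy?"
prompt = f"Context:\n{context}\n\nQuestion: {question}"
print(prompt)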


Architecture

The chatbot has two phases:

  1. Indexing: load documents → split into chunks → embed each chunk → store in vector database
  2. Querying: embed user question → find similar chunks → send chunks + question to Claude → return answer

Prerequisites

  • Python 3.9+
  • Anthropic API key (set as ANTHROPIC_API_KEY)
  • Basic Python knowledge

Step 1: Install Dependencies

pip install anthropic chromadb sentence-transformers pypdf2

What each package does:

  • anthropic — Claude API
  • chromadb — local vector database
  • sentence-transformers — embed text into vectors
  • pypdf2 — extract text from PDFs

Step 2: Load and Chunk Documents

Split documents into overlapping chunks so context isn’t lost at boundaries:

import PyPDF2
from pathlib import Path


def load_pdf(path: str) -> str:
    with open(path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        return "\n".join(page.extract_text() for page in reader.pages)


def load_txt(path: str) -> str:
    return Path(path).read_text(encoding="utf-8")


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i : i + chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks
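
A quick way to sanity-check the chunker before indexing anything. The 1,200-word file is hypothetical; with the defaults, consecutive chunks share 50 words:

# Hypothetical check: a ~1,200-word document yields 3 overlapping chunks
text = load_txt("sample.txt")                  # assume ~1,200 words
chunks = chunk_text(text, chunk_size=500, overlap=50)
print(len(chunks))             # 3 (words 0-499, 450-949, 900-1199)
print(len(chunks[0].split()))  # 500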

Step 3: Build the Vector Index

Embed each chunk and store it in ChromaDB:

import chromadb
from chromadb.utils import embedding_functions

# Use a lightweight embedding model (runs locally, no API needed)
EMBED_MODEL = "all-MiniLM-L6-v2"
embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)

# Create a persistent ChromaDB client
client = chromadb.PersistentClient(path="./chroma_db")


def build_index(documents: list[dict], collection_name: str = "docs"):
    """
    documents: [{"text": "...", "source": "file.pdf"}, ...]
    """
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embed_fn,
    )

    texts, ids, metadatas = [], [], []
    for doc in documents:
        chunks = chunk_text(doc["text"])
        for j, chunk in enumerate(chunks):
            texts.append(chunk)
            ids.append(f"{doc['source']}_chunk_{j}")
            metadatas.append({"source": doc["source"], "chunk": j})

    # upsert (rather than add) so re-indexing the same source updates in place
    collection.upsert(documents=texts, ids=ids, metadatas=metadatas)
    print(f"Indexed {len(texts)} chunks into '{collection_name}'")
    return collection
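
Putting Steps 2 and 3 together. The file names are placeholders for your own documents:

# Hypothetical usage: load two files and index them
docs = [
    {"text": load_pdf("handbook.pdf"), "source": "handbook.pdf"},
    {"text": load_txt("notes.txt"), "source": "notes.txt"},
]
collection = build_index(docs)  # prints e.g. "Indexed 42 chunks into 'docs'"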

Step 4: Query the Index

Find the most relevant chunks for a user’s question:

def retrieve(question: str, collection, n_results: int = 5) -> list[str]:
    results = collection.query(
        query_texts=[question],
        n_results=n_results,
    )
    return results["documents"][0]  # list of matching chunks

Step 5: Generate an Answer with Claude

Pack the retrieved chunks into the prompt and ask Claude to answer:

import anthropic

claude = anthropic.Anthropic()


def generate_answer(question: str, context_chunks: list[str]) -> str:
    context = "\n\n---\n\n".join(context_chunks)

    prompt = f"""You are a helpful assistant. Answer the question using ONLY the context below.
If the answer is not in the context, say "I don't have that information."

Context:
{context}

Question: {question}

Answer:"""

    response = claude.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text
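
Chaining Steps 4 and 5 gives a one-shot question answerer (the question is illustrative):

# Hypothetical usage: retrieve, then generate
question = "What is the return policy?"
chunks = retrieve(question, collection)
print(generate_answer(question, chunks))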

Step 6: Tie It Together — Full Chatbot

import anthropic
import chromadb
from chromadb.utils import embedding_functions
import PyPDF2
from pathlib import Path

# ── Config ────────────────────────────────────────────────────────────────────
EMBED_MODEL = "all-MiniLM-L6-v2"
COLLECTION = "docs"
DB_PATH = "./chroma_db"

# ── Helpers ───────────────────────────────────────────────────────────────────
embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)
chroma = chromadb.PersistentClient(path=DB_PATH)
claude = anthropic.Anthropic()


def load_document(path: str) -> str:
    p = Path(path)
    if p.suffix.lower() == ".pdf":
        with open(p, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            return "\n".join(page.extract_text() for page in reader.pages)
    return p.read_text(encoding="utf-8")


def chunk_text(text: str, size: int = 500, overlap: int = 50) -> list[str]:
    words = text.split()
    chunks, i = [], 0
    while i < len(words):
        chunks.append(" ".join(words[i : i + size]))
        i += size - overlap
    return chunks


def index_files(file_paths: list[str]) -> None:
    collection = chroma.get_or_create_collection(COLLECTION, embedding_function=embed_fn)
    texts, ids, metas = [], [], []
    for path in file_paths:
        text = load_document(path)
        for j, chunk in enumerate(chunk_text(text)):
            texts.append(chunk)
            ids.append(f"{Path(path).name}_chunk_{j}")
            metas.append({"source": path})
    # upsert so re-running the indexer on the same files updates in place
    collection.upsert(documents=texts, ids=ids, metadatas=metas)
    print(f"Indexed {len(texts)} chunks from {len(file_paths)} files")


def ask(question: str, n_results: int = 5) -> str:
    collection = chroma.get_collection(COLLECTION, embedding_function=embed_fn)
    results = collection.query(query_texts=[question], n_results=n_results)
    chunks = results["documents"][0]
    context = "\n\n---\n\n".join(chunks)

    response = claude.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": (
                    f"Answer based ONLY on the context below.\n"
                    f"If not found, say 'I don't have that information.'\n\n"
                    f"Context:\n{context}\n\nQuestion: {question}"
                ),
            }
        ],
    )
    return response.content[0].text


# ── Main ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        # Index mode: python chatbot.py doc1.pdf doc2.txt
        index_files(sys.argv[1:])
    else:
        # Chat mode
        print("RAG Chatbot ready. Type 'quit' to exit.\n")
        while True:
            q = input("You: ").strip()
            if q.lower() in ("quit", "exit"):
                break
            print("Bot:", ask(q), "\n")

Using the Chatbot

First, index your documents:

python chatbot.py company_handbook.pdf product_docs.txt

Then chat:

python chatbot.py
# RAG Chatbot ready. Type 'quit' to exit.
# You: What is the return policy?
# Bot: According to the handbook, returns are accepted within 30 days...

Adding Conversation History

For multi-turn conversations, maintain a history alongside the RAG context:

history = []


def ask_with_history(question: str) -> str:
    collection = chroma.get_collection(COLLECTION, embedding_function=embed_fn)
    chunks = collection.query(query_texts=[question], n_results=3)["documents"][0]
    context = "\n\n".join(chunks)

    # Build messages: prior turns + the new question (the system prompt is passed separately below)
    messages = history + [
        {
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {question}",
        }
    ]

    response = claude.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=1024,
        system="Answer using the provided context. Be concise.",
        messages=messages,
    )
    answer = response.content[0].text

    # Update history with the bare question; the retrieved context is rebuilt each turn
    history.append({"role": "user", "content": question})
    history.append({"role": "assistant", "content": answer})

    return answer
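
One caveat: history grows without bound, and every turn re-sends the whole list to the API. A minimal sketch for capping it, assuming the last five turns are enough (the cap of 10 messages is an arbitrary choice):

MAX_HISTORY = 10  # last 5 user/assistant pairs; arbitrary cap


def trim_history() -> None:
    # Delete everything older than the newest MAX_HISTORY messages.
    # Pairs stay aligned because the cap is even and we append in pairs.
    del history[:-MAX_HISTORY]

Call trim_history() right after the two history.append(...) calls.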

Performance Tips

  • Chunk size matters: 300–600 words is a good starting range. Too small and chunks lack context; too large and they bury the answer in noise
  • Increase n_results for complex questions that need more supporting context
  • Metadata filtering: pass where={"source": "company_handbook.pdf"} to query() to limit the search to one document; the value must match what you stored in the chunk metadata (see the sketch after this list)
  • Reranking: for production, retrieve the top 20 results, score them with a cross-encoder reranker, and pass only the top 5 to Claude (see the sketch after this list)
  • Caching: cache embeddings so re-indexing an unchanged file is fast
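
Sketches of the filtering and reranking tips, assuming collection is the ChromaDB collection from earlier. The where clause is standard ChromaDB; the cross-encoder model name is a common sentence-transformers default, so swap in whichever reranker you prefer:

from sentence_transformers import CrossEncoder

question = "What is the return policy?"

# Metadata filtering: restrict the search to one source. The value must
# match what was stored in the chunk's metadata at indexing time.
results = collection.query(
    query_texts=[question],
    n_results=20,
    where={"source": "company_handbook.pdf"},
)
candidates = results["documents"][0]

# Reranking: score each (question, chunk) pair with a cross-encoder,
# then keep only the best 5 for the prompt.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
scores = reranker.predict([(question, c) for c in candidates])
top5 = [c for _, c in sorted(zip(scores, candidates), key=lambda p: p[0], reverse=True)[:5]]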

Summary

You’ve built a working RAG chatbot that:

  • Loads PDF and text documents
  • Splits them into overlapping chunks
  • Embeds and stores chunks in ChromaDB
  • Retrieves relevant chunks for each question
  • Generates grounded answers with Claude

Next steps: explore RAG Tutorial with Python for more advanced techniques, or read the RAG vs Fine-Tuning comparison to decide which approach fits your use case.