"""Ingest PDFs into a Qdrant collection using embeddings from a LocalAI server.

Each PDF in a folder is converted to plain text, split into overlapping
word-based chunks, embedded through the LocalAI ``/v1/embeddings`` endpoint
and upserted into the ``planning_docs`` Qdrant collection.
"""

import os
import uuid

import fitz  # PyMuPDF
import requests
from qdrant_client import QdrantClient
from qdrant_client.http import models

LOCALAI_URL = os.getenv("LOCALAI_URL", "http://localhost:8500")
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
COLLECTION_NAME = "planning_docs"

# Model name sent to the embeddings endpoint.
EMBEDDING_MODEL = "nomic-embed-text"
# Vector size used when creating the collection; it must match the dimension
# of the vectors returned by EMBEDDING_MODEL.
EMBEDDING_DIM = 768
# Upper bound (seconds) on a single embedding request so that a stalled
# server cannot hang the ingestion forever.
EMBED_TIMEOUT_SECONDS = 120

qdrant = QdrantClient(url=QDRANT_URL)


def embed_text(text: str):
    """Return the embedding vector for *text* from the LocalAI server.

    Args:
        text: The text to embed.

    Returns:
        The ``embedding`` field of the first item in the response ``data``.

    Raises:
        ValueError: If the JSON response has no ``data`` key.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.post(
        f"{LOCALAI_URL}/v1/embeddings",
        json={"model": EMBEDDING_MODEL, "input": text},
        timeout=EMBED_TIMEOUT_SECONDS,
    )
    resp = response.json()
    if "data" not in resp:
        print("Embedding error:", resp)
        raise ValueError(f"Embedding failed: {resp}")
    return resp["data"][0]["embedding"]


def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the final word, any further window would only
        # repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks


def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in the PDF at *pdf_path*.

    Each page's text is followed by a newline. The document is closed when
    extraction finishes, including when an error occurs.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text("text") + "\n" for page in doc)


def ensure_collection():
    """Create the Qdrant collection if it does not already exist."""
    existing = {c.name for c in qdrant.get_collections().collections}
    if COLLECTION_NAME in existing:
        return
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIM,
            distance=models.Distance.COSINE,
        ),
    )


def ingest_pdfs(pdf_folder):
    """Embed every PDF in *pdf_folder* and upsert its chunks into Qdrant.

    Files whose name does not end in ``.pdf`` (case-insensitive) are ignored,
    and PDFs that yield no text are skipped. Each chunk is stored with a
    payload holding its ``source`` label and ``text``.

    Args:
        pdf_folder: Directory that contains the PDF files.
    """
    ensure_collection()
    for filename in os.listdir(pdf_folder):
        if not filename.lower().endswith(".pdf"):
            continue

        path = os.path.join(pdf_folder, filename)
        print(f"Processing {path}...")

        chunks = chunk_text(extract_text_from_pdf(path))
        if not chunks:
            # Nothing to embed (e.g. no extractable text); avoid sending an
            # empty batch to Qdrant.
            print(f"No text extracted from {path}, skipping.")
            continue

        vectors = [embed_text(chunk) for chunk in chunks]
        payloads = [
            {"source": f"{filename} (chunk {idx})", "text": chunk}
            for idx, chunk in enumerate(chunks)
        ]
        # Qdrant point IDs must be unsigned integers or UUIDs, so derive a
        # deterministic UUID from the file name and chunk index; re-ingesting
        # the same file then overwrites its earlier points.
        point_ids = [
            str(uuid.uuid5(uuid.NAMESPACE_URL, f"{filename}-{idx}"))
            for idx in range(len(chunks))
        ]

        qdrant.upsert(
            collection_name=COLLECTION_NAME,
            points=models.Batch(
                ids=point_ids,
                vectors=vectors,
                payloads=payloads,
            ),
        )


if __name__ == "__main__":
    ingest_pdfs("./pdfs")