| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
import os
import uuid

import fitz  # PyMuPDF
import requests
from qdrant_client import QdrantClient
from qdrant_client.http import models
# Service endpoints; each can be overridden through an environment variable
# of the same name.
LOCALAI_URL = os.environ.get("LOCALAI_URL", "http://localhost:8500")
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")

# Qdrant collection that receives the embedded document chunks.
COLLECTION_NAME = "planning_docs"

# Shared Qdrant client, constructed once when the module is imported.
qdrant = QdrantClient(url=QDRANT_URL)
def embed_text(text: str, timeout: float = 120.0):
    """Return the embedding for *text* from the LocalAI embeddings endpoint.

    Posts ``text`` to ``{LOCALAI_URL}/v1/embeddings`` with the
    ``nomic-embed-text`` model and returns the first embedding in the reply.

    Args:
        text: The text to embed.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled server would block
            the whole ingestion run forever.

    Returns:
        The value stored at ``data[0]["embedding"]`` in the JSON response.

    Raises:
        ValueError: If the response body is not JSON, or it carries no
            usable ``data`` entry (for example an error object).
        requests.RequestException: On connection failure or timeout.
    """
    r = requests.post(
        f"{LOCALAI_URL}/v1/embeddings",
        json={"model": "nomic-embed-text", "input": text},
        timeout=timeout,
    )
    try:
        resp = r.json()
    except ValueError as err:
        # Error pages (proxies, crashes) are often plain text or HTML; report
        # the status and a snippet instead of an opaque decode error.
        raise ValueError(
            f"Embedding failed: HTTP {r.status_code}, "
            f"non-JSON response: {r.text[:200]!r}"
        ) from err

    # A missing *or empty* "data" list both mean there is no embedding to use.
    if not isinstance(resp, dict) or not resp.get("data"):
        print("Embedding error:", resp)
        raise ValueError(f"Embedding failed: {resp}")
    return resp["data"][0]["embedding"]
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings with words joined by single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either crash range() or silently
        # produce no chunks at all.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window
        # would contain only words already covered by this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages in document order, each page followed by a
        newline. A document without pages yields an empty string.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text("text") + "\n" for page in doc)
def ensure_collection():
    """Create the ``COLLECTION_NAME`` collection unless Qdrant already lists it.

    A newly created collection is configured for 768-dimensional vectors
    compared by cosine distance.
    """
    existing_names = {
        collection.name for collection in qdrant.get_collections().collections
    }
    if COLLECTION_NAME in existing_names:
        return

    vector_params = models.VectorParams(
        size=768,
        distance=models.Distance.COSINE,
    )
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_params,
    )
def ingest_pdfs(pdf_folder):
    """Embed every PDF in *pdf_folder* and upsert its chunks into Qdrant.

    For each ``.pdf`` file (extension matched case-insensitively, files
    processed in sorted order) the text is extracted, split into chunks,
    embedded chunk by chunk, and written to ``COLLECTION_NAME`` in one
    batch. Each point's payload holds the chunk text and a ``source`` label
    of the form ``"<filename> (chunk <n>)"``.

    Args:
        pdf_folder: Directory whose immediate entries are scanned for PDFs.
    """
    ensure_collection()
    # Sorted so runs are reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(pdf_folder)):
        if not filename.lower().endswith(".pdf"):
            continue
        path = os.path.join(pdf_folder, filename)
        print(f"Processing {path}...")
        chunks = chunk_text(extract_text_from_pdf(path))
        if not chunks:
            # E.g. scanned PDFs without a text layer: nothing to embed, and
            # an empty batch is not worth sending.
            print(f"No extractable text in {path}, skipping.")
            continue

        ids = []
        vectors = []
        payloads = []
        for idx, chunk in enumerate(chunks):
            # Qdrant accepts only unsigned integers or UUIDs as point IDs.
            # A name-based UUID is valid and stays stable across runs, so
            # re-ingesting a file overwrites its points instead of
            # duplicating them.
            ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, f"{filename}-{idx}")))
            vectors.append(embed_text(chunk))
            payloads.append({
                "source": f"{filename} (chunk {idx})",
                "text": chunk,
            })

        qdrant.upsert(
            collection_name=COLLECTION_NAME,
            points=models.Batch(ids=ids, vectors=vectors, payloads=payloads),
        )
# Script entry point: when run directly (not imported), ingest every PDF
# found in the ./pdfs directory relative to the current working directory.
if __name__ == "__main__":
    ingest_pdfs("./pdfs")
|