# ingest.py (2.2 KB)
import os
import uuid

import fitz  # PyMuPDF
import requests
from qdrant_client import QdrantClient
from qdrant_client.http import models
  6. LOCALAI_URL = os.getenv("LOCALAI_URL", "http://localhost:8500")
  7. QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
  8. COLLECTION_NAME = "planning_docs"
  9. qdrant = QdrantClient(url=QDRANT_URL)
  10. def embed_text(text: str):
  11. r = requests.post(
  12. f"{LOCALAI_URL}/v1/embeddings",
  13. json={"model": "nomic-embed-text", "input": text}
  14. )
  15. resp = r.json()
  16. if "data" not in resp:
  17. print("Embedding error:", resp)
  18. raise ValueError(f"Embedding failed: {resp}")
  19. return resp["data"][0]["embedding"]
  20. def chunk_text(text, chunk_size=500, overlap=50):
  21. words = text.split()
  22. chunks = []
  23. for i in range(0, len(words), chunk_size - overlap):
  24. chunk = " ".join(words[i:i+chunk_size])
  25. chunks.append(chunk)
  26. return chunks
  27. def extract_text_from_pdf(pdf_path):
  28. doc = fitz.open(pdf_path)
  29. text = ""
  30. for page in doc:
  31. text += page.get_text("text") + "\n"
  32. return text
  33. def ensure_collection():
  34. if COLLECTION_NAME not in [c.name for c in qdrant.get_collections().collections]:
  35. qdrant.create_collection(
  36. collection_name=COLLECTION_NAME,
  37. vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE)
  38. )
  39. def ingest_pdfs(pdf_folder):
  40. ensure_collection()
  41. for filename in os.listdir(pdf_folder):
  42. if not filename.endswith(".pdf"):
  43. continue
  44. path = os.path.join(pdf_folder, filename)
  45. print(f"Processing {path}...")
  46. text = extract_text_from_pdf(path)
  47. chunks = chunk_text(text)
  48. payloads = []
  49. vectors = []
  50. for idx, chunk in enumerate(chunks):
  51. embedding = embed_text(chunk)
  52. payloads.append({
  53. "source": f"{filename} (chunk {idx})",
  54. "text": chunk
  55. })
  56. vectors.append(embedding)
  57. qdrant.upsert(
  58. collection_name=COLLECTION_NAME,
  59. points=models.Batch(
  60. ids=[f"{filename}-{i}" for i in range(len(chunks))],
  61. vectors=vectors,
  62. payloads=payloads
  63. )
  64. )
  65. if __name__ == "__main__":
  66. ingest_pdfs("./pdfs")