fastembed_common.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. from typing import Any, Optional, Union
  2. from pydantic import BaseModel, Field
  3. from qdrant_client.conversions.common_types import SparseVector
  4. from qdrant_client.http import models
  5. try:
  6. from fastembed import (
  7. TextEmbedding,
  8. SparseTextEmbedding,
  9. ImageEmbedding,
  10. LateInteractionTextEmbedding,
  11. LateInteractionMultimodalEmbedding,
  12. )
  13. from fastembed.common import OnnxProvider, ImageInput
  14. except ImportError:
  15. TextEmbedding = None
  16. SparseTextEmbedding = None
  17. ImageEmbedding = None
  18. LateInteractionTextEmbedding = None
  19. LateInteractionMultimodalEmbedding = None
  20. OnnxProvider = None
  21. ImageInput = None
# Record describing one query hit (as the name suggests; the producers are not visible here).
# `extra="forbid"` is handed to pydantic so that fields not declared below are rejected.
class QueryResponse(BaseModel, extra="forbid"):  # type: ignore
    # Identifier of the hit: either a string or an integer.
    id: Union[str, int]
    # Dense vector as a list of floats, or None.
    embedding: Optional[list[float]]
    # Sparse vector; the only field with an explicit default (None).
    sparse_embedding: Optional[SparseVector] = Field(default=None)
    # String-keyed mapping of arbitrary values (presumably the stored payload -- TODO confirm).
    metadata: dict[str, Any]
    # Document text (presumably the text associated with the hit -- TODO confirm against callers).
    document: str
    # Score of the hit. NOTE(review): scale and direction are not visible in this file.
    score: float
  29. class FastEmbedMisc:
  30. IS_INSTALLED: bool = False
  31. _TEXT_MODELS: set[str] = set()
  32. _IMAGE_MODELS: set[str] = set()
  33. _LATE_INTERACTION_TEXT_MODELS: set[str] = set()
  34. _LATE_INTERACTION_MULTIMODAL_MODELS: set[str] = set()
  35. _SPARSE_MODELS: set[str] = set()
  36. @classmethod
  37. def is_installed(cls) -> bool:
  38. if cls.IS_INSTALLED:
  39. return cls.IS_INSTALLED
  40. try:
  41. from fastembed import (
  42. SparseTextEmbedding,
  43. TextEmbedding,
  44. ImageEmbedding,
  45. LateInteractionMultimodalEmbedding,
  46. LateInteractionTextEmbedding,
  47. )
  48. assert len(SparseTextEmbedding.list_supported_models()) > 0
  49. assert len(TextEmbedding.list_supported_models()) > 0
  50. assert len(ImageEmbedding.list_supported_models()) > 0
  51. assert len(LateInteractionTextEmbedding.list_supported_models()) > 0
  52. assert len(LateInteractionMultimodalEmbedding.list_supported_models()) > 0
  53. cls.IS_INSTALLED = True
  54. except ImportError:
  55. cls.IS_INSTALLED = False
  56. return cls.IS_INSTALLED
  57. @classmethod
  58. def import_fastembed(cls) -> None:
  59. if cls.IS_INSTALLED:
  60. return
  61. # If it's not, ask the user to install it
  62. raise ImportError(
  63. "fastembed is not installed."
  64. " Please install it to enable fast vector indexing with `pip install fastembed`."
  65. )
  66. @classmethod
  67. def list_text_models(cls) -> dict[str, tuple[int, models.Distance]]:
  68. """Lists the supported dense text models.
  69. Requires invocation of TextEmbedding.list_supported_models() to support custom models.
  70. Returns:
  71. dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and distance metrics.
  72. """
  73. return (
  74. {
  75. model["model"]: (model["dim"], models.Distance.COSINE)
  76. for model in TextEmbedding.list_supported_models()
  77. }
  78. if TextEmbedding
  79. else {}
  80. )
  81. @classmethod
  82. def list_image_models(cls) -> dict[str, tuple[int, models.Distance]]:
  83. """Lists the supported image dense models.
  84. Custom image models are not supported yet, but calls to ImageEmbedding.list_supported_models() is done each
  85. time in order for preserving the same style as with TextEmbedding.
  86. Returns:
  87. dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and distance metrics.
  88. """
  89. return (
  90. {
  91. model["model"]: (model["dim"], models.Distance.COSINE)
  92. for model in ImageEmbedding.list_supported_models()
  93. }
  94. if ImageEmbedding
  95. else {}
  96. )
  97. @classmethod
  98. def list_late_interaction_text_models(cls) -> dict[str, tuple[int, models.Distance]]:
  99. """Lists the supported late interaction text models.
  100. Custom late interaction models are not supported yet, but calls to
  101. LateInteractionTextEmbedding.list_supported_models()
  102. is done each time in order for preserving the same style as with TextEmbedding.
  103. Returns:
  104. dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and distance metrics.
  105. """
  106. return (
  107. {
  108. model["model"]: (model["dim"], models.Distance.COSINE)
  109. for model in LateInteractionTextEmbedding.list_supported_models()
  110. }
  111. if LateInteractionTextEmbedding
  112. else {}
  113. )
  114. @classmethod
  115. def list_late_interaction_multimodal_models(cls) -> dict[str, tuple[int, models.Distance]]:
  116. """Lists the supported late interaction multimodal models.
  117. Custom late interaction multimodal models are not supported yet, but calls to
  118. LateInteractionMultimodalEmbedding.list_supported_models()
  119. is done each time in order for preserving the same style as with TextEmbedding.
  120. Returns:
  121. dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and distance metrics.
  122. """
  123. return (
  124. {
  125. model["model"]: (model["dim"], models.Distance.COSINE)
  126. for model in LateInteractionMultimodalEmbedding.list_supported_models()
  127. }
  128. if LateInteractionMultimodalEmbedding
  129. else {}
  130. )
  131. @classmethod
  132. def list_sparse_models(cls) -> dict[str, dict[str, Any]]:
  133. """Lists the supported sparse models.
  134. Custom sparse models are not supported yet, but calls to
  135. SparseTextEmbedding.list_supported_models()
  136. is done each time in order for preserving the same style as with TextEmbedding.
  137. Returns:
  138. dict[str, dict[str, Any]]: A dict of model names and their descriptions.
  139. """
  140. descriptions = {}
  141. if SparseTextEmbedding:
  142. for description in SparseTextEmbedding.list_supported_models():
  143. descriptions[description.pop("model")] = description
  144. return descriptions
  145. @classmethod
  146. def is_supported_text_model(cls, model_name: str) -> bool:
  147. """Checks if the model is supported by fastembed.
  148. Args:
  149. model_name (str): The name of the model to check.
  150. Returns:
  151. bool: True if the model is supported, False otherwise.
  152. """
  153. if model_name.lower() in cls._TEXT_MODELS:
  154. return True
  155. # update cached list in case custom models were added
  156. cls._TEXT_MODELS = {model.lower() for model in cls.list_text_models()}
  157. if model_name.lower() in cls._TEXT_MODELS:
  158. return True
  159. return False
  160. @classmethod
  161. def is_supported_image_model(cls, model_name: str) -> bool:
  162. """Checks if the model is supported by fastembed.
  163. Args:
  164. model_name (str): The name of the model to check.
  165. Returns:
  166. bool: True if the model is supported, False otherwise.
  167. """
  168. if model_name.lower() in cls._IMAGE_MODELS:
  169. return True
  170. # update cached list in case custom models were added
  171. cls._IMAGE_MODELS = {model.lower() for model in cls.list_image_models()}
  172. if model_name.lower() in cls._IMAGE_MODELS:
  173. return True
  174. return False
  175. @classmethod
  176. def is_supported_late_interaction_text_model(cls, model_name: str) -> bool:
  177. """Checks if the model is supported by fastembed.
  178. Args:
  179. model_name (str): The name of the model to check.
  180. Returns:
  181. bool: True if the model is supported, False otherwise.
  182. """
  183. if model_name.lower() in cls._LATE_INTERACTION_TEXT_MODELS:
  184. return True
  185. # update cached list in case custom models were added
  186. cls._LATE_INTERACTION_TEXT_MODELS = {
  187. model.lower() for model in cls.list_late_interaction_text_models()
  188. }
  189. if model_name.lower() in cls._LATE_INTERACTION_TEXT_MODELS:
  190. return True
  191. return False
  192. @classmethod
  193. def is_supported_late_interaction_multimodal_model(cls, model_name: str) -> bool:
  194. """Checks if the model is supported by fastembed.
  195. Args:
  196. model_name (str): The name of the model to check.
  197. Returns:
  198. bool: True if the model is supported, False otherwise.
  199. """
  200. if model_name.lower() in cls._LATE_INTERACTION_MULTIMODAL_MODELS:
  201. return True
  202. # update cached list in case custom models were added
  203. cls._LATE_INTERACTION_MULTIMODAL_MODELS = {
  204. model.lower() for model in cls.list_late_interaction_multimodal_models()
  205. }
  206. if model_name.lower() in cls._LATE_INTERACTION_MULTIMODAL_MODELS:
  207. return True
  208. return False
  209. @classmethod
  210. def is_supported_sparse_model(cls, model_name: str) -> bool:
  211. """Checks if the model is supported by fastembed.
  212. Args:
  213. model_name (str): The name of the model to check.
  214. Returns:
  215. bool: True if the model is supported, False otherwise.
  216. """
  217. if model_name.lower() in cls._SPARSE_MODELS:
  218. return True
  219. # update cached list in case custom models were added
  220. cls._SPARSE_MODELS = {model.lower() for model in cls.list_sparse_models()}
  221. if model_name.lower() in cls._SPARSE_MODELS:
  222. return True
  223. return False
  224. # region deprecated
  225. # prefer using methods builtin into QdrantClient, e.g. list_supported_text_models, list_supported_idf_models, etc.
  226. SUPPORTED_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
  227. {
  228. model["model"]: (model["dim"], models.Distance.COSINE)
  229. for model in TextEmbedding.list_supported_models()
  230. }
  231. if TextEmbedding
  232. else {}
  233. )
  234. SUPPORTED_SPARSE_EMBEDDING_MODELS: dict[str, dict[str, Any]] = (
  235. {model["model"]: model for model in SparseTextEmbedding.list_supported_models()}
  236. if SparseTextEmbedding
  237. else {}
  238. )
  239. IDF_EMBEDDING_MODELS: set[str] = (
  240. {
  241. model_config["model"]
  242. for model_config in SparseTextEmbedding.list_supported_models()
  243. if model_config.get("requires_idf", None)
  244. }
  245. if SparseTextEmbedding
  246. else set()
  247. )
  248. _LATE_INTERACTION_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
  249. {
  250. model["model"]: (model["dim"], models.Distance.COSINE)
  251. for model in LateInteractionTextEmbedding.list_supported_models()
  252. }
  253. if LateInteractionTextEmbedding
  254. else {}
  255. )
  256. _IMAGE_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
  257. {
  258. model["model"]: (model["dim"], models.Distance.COSINE)
  259. for model in ImageEmbedding.list_supported_models()
  260. }
  261. if ImageEmbedding
  262. else {}
  263. )
  264. _LATE_INTERACTION_MULTIMODAL_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
  265. {
  266. model["model"]: (model["dim"], models.Distance.COSINE)
  267. for model in LateInteractionMultimodalEmbedding.list_supported_models()
  268. }
  269. if LateInteractionMultimodalEmbedding
  270. else {}
  271. )
  272. # endregion