# kotaemon/knowledgehub/embeddings/base.py

from __future__ import annotations

from abc import abstractmethod
from typing import Type

from langchain.schema.embeddings import Embeddings as LCEmbeddings
from theflow import Param

from ..base import BaseComponent
from ..documents.base import Document


class BaseEmbeddings(BaseComponent):
    """Abstract base for components that embed text into float vectors."""

    @abstractmethod
    def run(
        self, text: str | list[str] | Document | list[Document]
    ) -> list[list[float]]:
        """Embed ``text``; concrete subclasses must override this method.

        Args:
            text: a single string or Document, or a list of strings or
                Documents.

        Returns:
            A list of embedding vectors, each a list of floats.
        """


class LangchainEmbeddings(BaseEmbeddings):
    """Adapter that exposes a Langchain embeddings class as a component.

    Concrete subclasses assign ``_lc_class`` to the Langchain embeddings class
    to wrap. Constructor keyword arguments and attribute assignments whose
    names match an entry of ``_lc_class.__fields__`` are collected in
    ``self._kwargs`` and passed to ``_lc_class`` when ``agent`` is built;
    everything else is handed to the parent component.
    """

    # Langchain embeddings class to wrap; concrete subclasses must assign it.
    _lc_class: Type[LCEmbeddings]

    def __init__(self, **params):
        """Split ``params`` between the Langchain class and the component.

        Args:
            **params: keyword arguments. Those named in
                ``_lc_class.__fields__`` are stored in ``self._kwargs``; the
                remainder is forwarded to the parent ``__init__``.

        Raises:
            AttributeError: if the class does not define ``_lc_class``.
        """
        # `_lc_class` is only annotated on this class, so the attribute does
        # not exist until a subclass assigns it. Look it up with a default so
        # the unset case reaches the explicit, descriptive error below.
        lc_class = getattr(type(self), "_lc_class", None)
        if lc_class is None:
            raise AttributeError(
                "Should set _lc_class attribute to the Embeddings class from "
                "Langchain if using Embeddings from Langchain"
            )

        self._kwargs: dict = {}
        # Iterate over a snapshot of the keys because matching entries are
        # popped from `params` inside the loop.
        for param in list(params):
            if param in lc_class.__fields__:  # type: ignore
                self._kwargs[param] = params.pop(param)
        super().__init__(**params)

    def __setattr__(self, name, value):
        """Route Langchain field assignments into ``self._kwargs``.

        Names listed in ``_lc_class.__fields__`` are recorded in
        ``self._kwargs``; any other name is set through the parent class.
        """
        if name in self._lc_class.__fields__:
            self._kwargs[name] = value
        else:
            super().__setattr__(name, value)

    # NOTE(review): declared with cache=False, presumably so the Langchain
    # object is rebuilt from the current `_kwargs` on each access -- confirm
    # against theflow's `Param.auto` semantics.
    @Param.auto(cache=False)
    def agent(self):
        """Instantiate ``_lc_class`` from the collected keyword arguments."""
        return self._lc_class(**self._kwargs)

    def run(
        self, text: str | list[str] | Document | list[Document]
    ) -> list[list[float]]:
        """Embed one or more strings/Documents through the Langchain object.

        Args:
            text: a single string or Document, or a list of them. A non-list
                value is treated as a one-item list; Documents contribute
                their ``text`` attribute.

        Returns:
            The result of ``agent.embed_documents`` for the collected strings.

        Raises:
            ValueError: if an item is neither a ``str`` nor a ``Document``.
        """
        input_: list[str] = []
        if not isinstance(text, list):
            text = [text]

        for item in text:
            if isinstance(item, str):
                input_.append(item)
            elif isinstance(item, Document):
                input_.append(item.text)
            else:
                raise ValueError(
                    f"Invalid input type {type(item)}, should be str or Document"
                )

        return self.agent.embed_documents(input_)