diff --git a/knowledgehub/base/__init__.py b/knowledgehub/base/__init__.py index f83cbe5..7c600cc 100644 --- a/knowledgehub/base/__init__.py +++ b/knowledgehub/base/__init__.py @@ -1,3 +1,4 @@ from .component import BaseComponent +from .schema import Document -__all__ = ["BaseComponent"] +__all__ = ["BaseComponent", "Document"] diff --git a/knowledgehub/documents/base.py b/knowledgehub/base/schema.py similarity index 86% rename from knowledgehub/documents/base.py rename to knowledgehub/base/schema.py index b20f74a..648f5d0 100644 --- a/knowledgehub/documents/base.py +++ b/knowledgehub/base/schema.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import TYPE_CHECKING, Any, Optional, TypeVar from llama_index.bridge.pydantic import Field @@ -72,3 +74,19 @@ class RetrievedDocument(Document): score: float = Field(default=0.0) retrieval_metadata: dict = Field(default={}) + + +class LLMInterface(Document): + candidates: list[str] = Field(default_factory=list) + completion_tokens: int = -1 + total_tokens: int = -1 + prompt_tokens: int = -1 + logits: list[list[float]] = Field(default_factory=list) + + +class ExtractorOutput(Document): + """ + Represents the output of an extractor. + """ + + matches: list[str] diff --git a/knowledgehub/chatbot/base.py b/knowledgehub/chatbot/base.py index bbfbd48..ac01675 100644 --- a/knowledgehub/chatbot/base.py +++ b/knowledgehub/chatbot/base.py @@ -5,7 +5,7 @@ from langchain.schema.messages import AIMessage, SystemMessage from theflow import Param, SessionCompose from ..base import BaseComponent -from ..llms.base import LLMInterface +from ..base.schema import LLMInterface from ..llms.chats.base import BaseMessage, HumanMessage diff --git a/knowledgehub/composite/branching.py b/knowledgehub/composite/branching.py index fb0b192..2c84a69 100644 --- a/knowledgehub/composite/branching.py +++ b/knowledgehub/composite/branching.py @@ -2,9 +2,8 @@ from typing import List, Optional from theflow import Param -from kotaemon.base import BaseComponent +from kotaemon.base import BaseComponent, Document from kotaemon.composite.linear import GatedLinearPipeline -from kotaemon.documents.base import Document class SimpleBranchingPipeline(BaseComponent): diff --git a/knowledgehub/composite/linear.py b/knowledgehub/composite/linear.py index 64c6d55..2f1cefd 100644 --- a/knowledgehub/composite/linear.py +++ b/knowledgehub/composite/linear.py @@ -1,7 +1,7 @@ from typing import Any, Callable, Optional, Union from kotaemon.base import BaseComponent -from kotaemon.documents.base import Document, IO_Type +from kotaemon.base.schema import Document, IO_Type from kotaemon.llms.chats.base import ChatLLM from kotaemon.llms.completions.base import LLM from kotaemon.prompt.base import BasePromptComponent diff --git a/knowledgehub/docstores/base.py b/knowledgehub/docstores/base.py index 14eb7ea..ce2a55d 100644 --- a/knowledgehub/docstores/base.py +++ b/knowledgehub/docstores/base.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from pathlib import Path from typing import List, Optional, Union -from ..documents.base import Document +from ..base import Document class BaseDocumentStore(ABC): diff --git a/knowledgehub/docstores/in_memory.py b/knowledgehub/docstores/in_memory.py index 577363e..339d735 100644 --- a/knowledgehub/docstores/in_memory.py +++ b/knowledgehub/docstores/in_memory.py @@ -2,7 +2,7 @@ import json from pathlib import Path from typing import List, Optional, Union -from ..documents.base import Document +from ..base import Document from .base import BaseDocumentStore diff --git a/knowledgehub/documents/__init__.py b/knowledgehub/documents/__init__.py deleted file mode 100644 index 8fbd1ea..0000000 --- a/knowledgehub/documents/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .base import Document - -__all__ = ["Document"] diff --git a/knowledgehub/embeddings/base.py b/knowledgehub/embeddings/base.py index e51aabf..688eb1c 100644 --- a/knowledgehub/embeddings/base.py +++ b/knowledgehub/embeddings/base.py @@ -6,8 +6,7 @@ from typing import Type from langchain.schema.embeddings import Embeddings as LCEmbeddings from theflow import Param -from ..base import BaseComponent -from ..documents.base import Document +from ..base import BaseComponent, Document class BaseEmbeddings(BaseComponent): diff --git a/knowledgehub/llms/base.py b/knowledgehub/llms/base.py deleted file mode 100644 index a2aa387..0000000 --- a/knowledgehub/llms/base.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import List - -from pydantic import Field - -from kotaemon.documents.base import Document - - -class LLMInterface(Document): - candidates: List[str] = Field(default_factory=list) - completion_tokens: int = -1 - total_tokens: int = -1 - prompt_tokens: int = -1 - logits: List[List[float]] = Field(default_factory=list) diff --git a/knowledgehub/llms/chats/base.py b/knowledgehub/llms/chats/base.py index 301b548..beed9f6 100644 --- a/knowledgehub/llms/chats/base.py +++ b/knowledgehub/llms/chats/base.py @@ -8,7 +8,7 @@ from langchain.schema.messages import BaseMessage, HumanMessage from theflow.base import Param from ...base import BaseComponent -from ..base import LLMInterface +from ...base.schema import LLMInterface logger = logging.getLogger(__name__) diff --git a/knowledgehub/llms/completions/base.py b/knowledgehub/llms/completions/base.py index 238f0f2..9f8cd4b 100644 --- a/knowledgehub/llms/completions/base.py +++ b/knowledgehub/llms/completions/base.py @@ -5,7 +5,7 @@ from langchain.llms.base import BaseLLM from theflow.base import Param from ...base import BaseComponent -from ..base import LLMInterface +from ...base.schema import LLMInterface logger = logging.getLogger(__name__) diff --git a/knowledgehub/loaders/base.py b/knowledgehub/loaders/base.py index 64dbd44..f74cc45 100644 --- a/knowledgehub/loaders/base.py +++ b/knowledgehub/loaders/base.py @@ -4,8 +4,7 @@ from typing import Any, List, Type, Union from llama_index import SimpleDirectoryReader, download_loader from llama_index.readers.base import BaseReader -from ..base import BaseComponent -from ..documents.base import Document +from ..base import BaseComponent, Document class AutoReader(BaseComponent): diff --git a/knowledgehub/loaders/excel_loader.py b/knowledgehub/loaders/excel_loader.py index dec358f..298481c 100644 --- a/knowledgehub/loaders/excel_loader.py +++ b/knowledgehub/loaders/excel_loader.py @@ -8,7 +8,7 @@ from typing import Any, List, Optional, Union from llama_index.readers.base import BaseReader -from kotaemon.documents import Document +from kotaemon.base import Document class PandasExcelReader(BaseReader): diff --git a/knowledgehub/loaders/mathpix_loader.py b/knowledgehub/loaders/mathpix_loader.py index cf85453..1fefe33 100644 --- a/knowledgehub/loaders/mathpix_loader.py +++ b/knowledgehub/loaders/mathpix_loader.py @@ -8,7 +8,7 @@ import requests from langchain.utils import get_from_dict_or_env from llama_index.readers.base import BaseReader -from kotaemon.documents import Document +from kotaemon.base import Document from .utils.table import parse_markdown_text_to_tables, strip_special_chars_markdown diff --git a/knowledgehub/loaders/ocr_loader.py b/knowledgehub/loaders/ocr_loader.py index efc8671..e2c2bc7 100644 --- a/knowledgehub/loaders/ocr_loader.py +++ b/knowledgehub/loaders/ocr_loader.py @@ -5,7 +5,7 @@ from uuid import uuid4 import requests from llama_index.readers.base import BaseReader -from kotaemon.documents import Document +from kotaemon.base import Document from .utils.pdf_ocr import parse_ocr_output, read_pdf_unstructured from .utils.table import strip_special_chars_markdown diff --git a/knowledgehub/parsers/splitter.py b/knowledgehub/parsers/splitter.py index c9b1203..aa96a46 100644 --- a/knowledgehub/parsers/splitter.py +++ b/knowledgehub/parsers/splitter.py @@ -4,9 +4,7 @@ from llama_index.node_parser import SimpleNodeParser as LISimpleNodeParser from llama_index.node_parser.interface import NodeParser from llama_index.text_splitter import TokenTextSplitter -from kotaemon.base import BaseComponent - -from ..documents.base import Document +from ..base import BaseComponent, Document __all__ = ["TokenTextSplitter"] diff --git a/knowledgehub/pipelines/indexing.py b/knowledgehub/pipelines/indexing.py index 03e1d53..c98ca38 100644 --- a/knowledgehub/pipelines/indexing.py +++ b/knowledgehub/pipelines/indexing.py @@ -5,9 +5,8 @@ from pathlib import Path from theflow import Node, Param -from ..base import BaseComponent +from ..base import BaseComponent, Document from ..docstores import BaseDocumentStore -from ..documents.base import Document from ..embeddings import BaseEmbeddings from ..vectorstores import BaseVectorStore diff --git a/knowledgehub/pipelines/qa.py b/knowledgehub/pipelines/qa.py index b11a21e..d43bfa2 100644 --- a/knowledgehub/pipelines/qa.py +++ b/knowledgehub/pipelines/qa.py @@ -6,8 +6,8 @@ from theflow import Node from theflow.utils.modules import ObjectInitDeclaration as _ from kotaemon.base import BaseComponent +from kotaemon.base.schema import RetrievedDocument from kotaemon.docstores import InMemoryDocumentStore -from kotaemon.documents.base import RetrievedDocument from kotaemon.embeddings import AzureOpenAIEmbeddings from kotaemon.llms.chats.openai import AzureChatOpenAI from kotaemon.pipelines.agents import BaseAgent diff --git a/knowledgehub/pipelines/retrieving.py b/knowledgehub/pipelines/retrieving.py index 5643f5e..47cb906 100644 --- a/knowledgehub/pipelines/retrieving.py +++ b/knowledgehub/pipelines/retrieving.py @@ -5,8 +5,8 @@ from pathlib import Path from theflow import Node, Param from ..base import BaseComponent +from ..base.schema import Document, RetrievedDocument from ..docstores import BaseDocumentStore -from ..documents.base import Document, RetrievedDocument from ..embeddings import BaseEmbeddings from ..vectorstores import BaseVectorStore diff --git a/knowledgehub/pipelines/tools/wikipedia.py b/knowledgehub/pipelines/tools/wikipedia.py index ef6b8d2..08990b8 100644 --- a/knowledgehub/pipelines/tools/wikipedia.py +++ b/knowledgehub/pipelines/tools/wikipedia.py @@ -2,7 +2,7 @@ from typing import Any, AnyStr, Optional, Type, Union from pydantic import BaseModel, Field -from kotaemon.documents.base import Document +from kotaemon.base import Document from .base import BaseTool diff --git a/knowledgehub/post_processing/extractor.py b/knowledgehub/post_processing/extractor.py index fbc2285..a269fb0 100644 --- a/knowledgehub/post_processing/extractor.py +++ b/knowledgehub/post_processing/extractor.py @@ -5,16 +5,8 @@ from typing import Callable from theflow import Param -from kotaemon.base import BaseComponent -from kotaemon.documents.base import Document - - -class ExtractorOutput(Document): - """ - Represents the output of an extractor. - """ - - matches: list[str] +from kotaemon.base import BaseComponent, Document +from kotaemon.base.schema import ExtractorOutput class RegexExtractor(BaseComponent): diff --git a/knowledgehub/prompt/base.py b/knowledgehub/prompt/base.py index 0459fc3..494f2ce 100644 --- a/knowledgehub/prompt/base.py +++ b/knowledgehub/prompt/base.py @@ -1,7 +1,6 @@ from typing import Callable, Union -from kotaemon.base import BaseComponent -from kotaemon.documents.base import Document +from kotaemon.base import BaseComponent, Document from kotaemon.prompt.template import PromptTemplate diff --git a/knowledgehub/vectorstores/base.py b/knowledgehub/vectorstores/base.py index 0760f8d..5df6792 100644 --- a/knowledgehub/vectorstores/base.py +++ b/knowledgehub/vectorstores/base.py @@ -6,7 +6,7 @@ from llama_index.vector_stores.types import BasePydanticVectorStore from llama_index.vector_stores.types import VectorStore as LIVectorStore from llama_index.vector_stores.types import VectorStoreQuery -from ..documents.base import Document +from ..base import Document class BaseVectorStore(ABC): diff --git a/tests/test_docstores.py b/tests/test_docstores.py index d218211..db1da9e 100644 --- a/tests/test_docstores.py +++ b/tests/test_docstores.py @@ -1,7 +1,7 @@ import pytest +from kotaemon.base import Document from kotaemon.docstores import InMemoryDocumentStore -from kotaemon.documents.base import Document def test_simple_document_store_base_interfaces(tmp_path): diff --git a/tests/test_documents.py b/tests/test_documents.py index fcec590..234d750 100644 --- a/tests/test_documents.py +++ b/tests/test_documents.py @@ -1,4 +1,4 @@ -from kotaemon.documents.base import Document, RetrievedDocument +from kotaemon.base.schema import Document, RetrievedDocument from .conftest import skip_when_haystack_not_installed diff --git a/tests/test_indexing_retrieval.py b/tests/test_indexing_retrieval.py index 79f3bd7..6fb9ca6 100644 --- a/tests/test_indexing_retrieval.py +++ b/tests/test_indexing_retrieval.py @@ -5,8 +5,8 @@ from typing import cast import pytest from openai.resources.embeddings import Embeddings +from kotaemon.base import Document from kotaemon.docstores import InMemoryDocumentStore -from kotaemon.documents.base import Document from kotaemon.embeddings.openai import AzureOpenAIEmbeddings from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline diff --git a/tests/test_llms_chat_models.py b/tests/test_llms_chat_models.py index 932447b..fd3c4e4 100644 --- a/tests/test_llms_chat_models.py +++ b/tests/test_llms_chat_models.py @@ -4,7 +4,7 @@ from langchain.chat_models import AzureChatOpenAI as AzureChatOpenAILC from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage from openai.types.chat.chat_completion import ChatCompletion -from kotaemon.llms.base import LLMInterface +from kotaemon.base.schema import LLMInterface from kotaemon.llms.chats.openai import AzureChatOpenAI _openai_chat_completion_response = ChatCompletion.parse_obj( diff --git a/tests/test_llms_completion_models.py b/tests/test_llms_completion_models.py index ef001a5..04be9ba 100644 --- a/tests/test_llms_completion_models.py +++ b/tests/test_llms_completion_models.py @@ -4,7 +4,7 @@ from langchain.llms import AzureOpenAI as AzureOpenAILC from langchain.llms import OpenAI as OpenAILC from openai.types.completion import Completion -from kotaemon.llms.base import LLMInterface +from kotaemon.base.schema import LLMInterface from kotaemon.llms.completions.openai import AzureOpenAI, OpenAI _openai_completion_response = Completion.parse_obj( diff --git a/tests/test_post_processing.py b/tests/test_post_processing.py index 8f14384..bda8337 100644 --- a/tests/test_post_processing.py +++ b/tests/test_post_processing.py @@ -1,6 +1,6 @@ import pytest -from kotaemon.documents.base import Document +from kotaemon.base import Document from kotaemon.post_processing.extractor import RegexExtractor diff --git a/tests/test_prompt.py b/tests/test_prompt.py index 6eb73c3..915c67b 100644 --- a/tests/test_prompt.py +++ b/tests/test_prompt.py @@ -1,6 +1,6 @@ import pytest -from kotaemon.documents.base import Document +from kotaemon.base import Document from kotaemon.post_processing.extractor import RegexExtractor from kotaemon.prompt.base import BasePromptComponent from kotaemon.prompt.template import PromptTemplate diff --git a/tests/test_reader.py b/tests/test_reader.py index 6b7bf8b..f65c25a 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -3,7 +3,7 @@ from pathlib import Path from langchain.schema import Document as LangchainDocument from llama_index.node_parser import SimpleNodeParser -from kotaemon.documents.base import Document +from kotaemon.base import Document from kotaemon.loaders import AutoReader diff --git a/tests/test_tools.py b/tests/test_tools.py index 771cb71..2f336f5 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -4,8 +4,8 @@ from pathlib import Path import pytest from openai.resources.embeddings import Embeddings +from kotaemon.base import Document from kotaemon.docstores import InMemoryDocumentStore -from kotaemon.documents.base import Document from kotaemon.embeddings.openai import AzureOpenAIEmbeddings from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py index 89fc3b2..0e9a9d0 100644 --- a/tests/test_vectorstore.py +++ b/tests/test_vectorstore.py @@ -1,6 +1,6 @@ import json -from kotaemon.documents.base import Document +from kotaemon.base import Document from kotaemon.vectorstores import ChromaVectorStore, InMemoryVectorStore