From d22ae88c7a52e4974cc0e77c6148cf19bb108918 Mon Sep 17 00:00:00 2001 From: ian_Cin Date: Thu, 21 Mar 2024 22:48:20 +0700 Subject: [PATCH] make default installation faster (#2) * remove cohere as default * refactor dependencies * use llama-index pdf reader as default (pypdf) * fix some lazy docstring * update install scripts * minor fix --- .github/workflows/unit-test.yaml | 2 +- .../kotaemon/indices/ingests/files.py | 3 +- libs/kotaemon/kotaemon/loaders/base.py | 2 +- libs/kotaemon/kotaemon/loaders/docx_loader.py | 2 +- libs/kotaemon/kotaemon/loaders/html_loader.py | 2 +- libs/kotaemon/pyproject.toml | 36 ++++++++++--------- libs/ktem/ktem/index/file/pipelines.py | 6 ++-- libs/ktem/pyproject.toml | 4 --- scripts/run_linux.sh | 2 +- scripts/run_macos.sh | 2 +- scripts/run_windows.bat | 2 +- 11 files changed, 30 insertions(+), 33 deletions(-) diff --git a/.github/workflows/unit-test.yaml b/.github/workflows/unit-test.yaml index 82b9154..c94ad19 100644 --- a/.github/workflows/unit-test.yaml +++ b/.github/workflows/unit-test.yaml @@ -89,7 +89,7 @@ jobs: run: | python -m pip install --upgrade pip cd libs/kotaemon - pip install -U --upgrade-strategy eager -e .[dev] + pip install -U --upgrade-strategy eager -e .[all] - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }} if: | diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py index 80044af..ed00e5c 100644 --- a/libs/kotaemon/kotaemon/indices/ingests/files.py +++ b/libs/kotaemon/kotaemon/indices/ingests/files.py @@ -7,7 +7,6 @@ from kotaemon.base import BaseComponent, Document, Param from kotaemon.indices.extractors import BaseDocParser from kotaemon.indices.splitters import BaseSplitter, TokenSplitter from kotaemon.loaders import ( - AutoReader, DirectoryReader, MathpixPDFReader, OCRReader, @@ -59,7 +58,7 @@ class DocumentIngestor(BaseComponent): file_extractors[ext] = cls() if self.pdf_mode == "normal": - 
file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore + pass # use default loader of llama-index which is pypdf elif self.pdf_mode == "ocr": file_extractors[".pdf"] = OCRReader() else: diff --git a/libs/kotaemon/kotaemon/loaders/base.py b/libs/kotaemon/kotaemon/loaders/base.py index ca27e49..2e52f72 100644 --- a/libs/kotaemon/kotaemon/loaders/base.py +++ b/libs/kotaemon/kotaemon/loaders/base.py @@ -55,7 +55,7 @@ class LIReaderMixin(BaseComponent): def _get_wrapped_class(self) -> Type["LIBaseReader"]: raise NotImplementedError( - "Please return the relevant Langchain class in in _get_lc_class" + "Please return the relevant llama-index class in in _get_wrapped_class" ) def __init__(self, *args, **kwargs): diff --git a/libs/kotaemon/kotaemon/loaders/docx_loader.py b/libs/kotaemon/kotaemon/loaders/docx_loader.py index b8f77b8..dcec539 100644 --- a/libs/kotaemon/kotaemon/loaders/docx_loader.py +++ b/libs/kotaemon/kotaemon/loaders/docx_loader.py @@ -33,7 +33,7 @@ class DocxReader(BaseReader): """Load data using Docx reader Args: - file_path (Path): Path to PDF file + file_path (Path): Path to .docx file Returns: List[Document]: list of documents extracted from the HTML file diff --git a/libs/kotaemon/kotaemon/loaders/html_loader.py b/libs/kotaemon/kotaemon/loaders/html_loader.py index fd0eddd..1295cfc 100644 --- a/libs/kotaemon/kotaemon/loaders/html_loader.py +++ b/libs/kotaemon/kotaemon/loaders/html_loader.py @@ -37,7 +37,7 @@ class HtmlReader(BaseReader): """Load data using Html reader Args: - file_path: path to pdf file + file_path: path to HTML file extra_info: extra information passed to this reader during extracting data Returns: diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml index 5abfb98..1ba6963 100644 --- a/libs/kotaemon/pyproject.toml +++ b/libs/kotaemon/pyproject.toml @@ -17,6 +17,8 @@ description = "Kotaemon core library for AI development." 
dependencies = [ "langchain", "langchain-community", + "langchain-openai", + "openai", "theflow", "llama-index>=0.9.0,<0.10.0", "llama-hub", @@ -27,6 +29,11 @@ dependencies = [ "pandas", "trogon", "tenacity", + "python-dotenv", # currently used to read configs from file, should be removed in the future + "chromadb", + "unstructured", + "pypdf", + "html2text", ] readme = "README.md" license = { text = "MIT License" } @@ -42,6 +49,18 @@ classifiers = [ ] [project.optional-dependencies] +adv = [ + "wikipedia", + "duckduckgo-search", + "googlesearch-python", + "python-docx", + "pytest-mock", + "unstructured[pdf]", + "sentence_transformers", + "cohere", + "elasticsearch", + "llama-cpp-python", +] dev = [ "ipython", "pytest", @@ -50,23 +69,8 @@ dev = [ "flake8", "sphinx", "coverage", - "openai", - "langchain-openai", - "chromadb", - "wikipedia", - "duckduckgo-search", - "googlesearch-python", - "python-docx", - "python-dotenv", - "pytest-mock", - "unstructured[pdf]", - "sentence_transformers", - "cohere", - "elasticsearch", - "pypdf", - "html2text", - "llama-cpp-python", ] +all = ["kotaemon[adv,dev]"] [project.scripts] kh = "kotaemon.cli:main" diff --git a/libs/ktem/ktem/index/file/pipelines.py b/libs/ktem/ktem/index/file/pipelines.py index ff7c895..d15d2fb 100644 --- a/libs/ktem/ktem/index/file/pipelines.py +++ b/libs/ktem/ktem/index/file/pipelines.py @@ -25,7 +25,7 @@ from theflow.utils.modules import import_dotted_string from kotaemon.base import RetrievedDocument from kotaemon.indices import VectorIndexing, VectorRetrieval from kotaemon.indices.ingests import DocumentIngestor -from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking +from kotaemon.indices.rankings import BaseReranking, LLMReranking from .base import BaseFileIndexIndexing, BaseFileIndexRetriever @@ -67,9 +67,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever): vector_retrieval: VectorRetrieval = VectorRetrieval.withx( embedding=embeddings.get_default(), ) - reranker: 
BaseReranking = CohereReranking.withx( - cohere_api_key=getattr(settings, "COHERE_API_KEY", "") - ) >> LLMReranking.withx(llm=llms.get_lowest_cost()) + reranker: BaseReranking = LLMReranking.withx(llm=llms.get_lowest_cost()) get_extra_table: bool = False def run( diff --git a/libs/ktem/pyproject.toml b/libs/ktem/pyproject.toml index c3160f1..6fee8a0 100644 --- a/libs/ktem/pyproject.toml +++ b/libs/ktem/pyproject.toml @@ -13,18 +13,14 @@ version = "0.2.0" requires-python = ">= 3.10" description = "RAG-based Question and Answering Application" dependencies = [ - "chromadb", "click", - "cohere", "platformdirs", "pluggy", "python-decouple", - "python-dotenv", "python-pptx", "sqlalchemy", "sqlmodel", "tiktoken", - "unstructured[pdf]", ] readme = "README.md" license = { text = "MIT License" } diff --git a/scripts/run_linux.sh b/scripts/run_linux.sh index 7298b87..8e2ea05 100755 --- a/scripts/run_linux.sh +++ b/scripts/run_linux.sh @@ -92,7 +92,7 @@ function install_dependencies() { if pip list 2>/dev/null | grep -q "kotaemon"; then echo "Requirements are already installed" else - local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]" + local kotaemon_root="$(pwd)/libs/kotaemon" local ktem_root="$(pwd)/libs/ktem/" echo "" && echo "Install kotaemon's requirements" diff --git a/scripts/run_macos.sh b/scripts/run_macos.sh index 6ad9901..de71f75 100755 --- a/scripts/run_macos.sh +++ b/scripts/run_macos.sh @@ -92,7 +92,7 @@ function install_dependencies() { if pip list 2>/dev/null | grep -q "kotaemon"; then echo "Requirements are already installed" else - local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]" + local kotaemon_root="$(pwd)/libs/kotaemon" local ktem_root="$(pwd)/libs/ktem/" echo "" && echo "Install kotaemon's requirements" diff --git a/scripts/run_windows.bat b/scripts/run_windows.bat index 4e5db86..e365686 100644 --- a/scripts/run_windows.bat +++ b/scripts/run_windows.bat @@ -114,7 +114,7 @@ IF %ERRORLEVEL% == 0 ( ECHO Dependencies are already installed ) ELSE ( ECHO 
Install kotaemon's requirements - CALL python -m pip install -e "%CD%\libs\kotaemon\.[dev]" + CALL python -m pip install -e "%CD%\libs\kotaemon" ECHO Install ktem's requirements CALL python -m pip install -e "%CD%\libs\ktem"