make default installation faster (#2)

* remove cohere as default

* refactor dependencies

* use llama-index pdf reader as default (pypdf)

* fix some lazy docstrings

* update install scripts

* minor fix
This commit is contained in:
ian_Cin 2024-03-21 22:48:20 +07:00 committed by GitHub
parent a8f92b3f9e
commit d22ae88c7a
11 changed files with 30 additions and 33 deletions

View File

@ -89,7 +89,7 @@ jobs:
run: |
python -m pip install --upgrade pip
cd libs/kotaemon
pip install -U --upgrade-strategy eager -e .[dev]
pip install -U --upgrade-strategy eager -e .[all]
- name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
if: |

View File

@ -7,7 +7,6 @@ from kotaemon.base import BaseComponent, Document, Param
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
from kotaemon.loaders import (
AutoReader,
DirectoryReader,
MathpixPDFReader,
OCRReader,
@ -59,7 +58,7 @@ class DocumentIngestor(BaseComponent):
file_extractors[ext] = cls()
if self.pdf_mode == "normal":
file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore
pass # use default loader of llama-index which is pypdf
elif self.pdf_mode == "ocr":
file_extractors[".pdf"] = OCRReader()
else:

View File

@ -55,7 +55,7 @@ class LIReaderMixin(BaseComponent):
def _get_wrapped_class(self) -> Type["LIBaseReader"]:
raise NotImplementedError(
"Please return the relevant Langchain class in in _get_lc_class"
"Please return the relevant llama-index class in in _get_wrapped_class"
)
def __init__(self, *args, **kwargs):

View File

@ -33,7 +33,7 @@ class DocxReader(BaseReader):
"""Load data using Docx reader
Args:
file_path (Path): Path to PDF file
file_path (Path): Path to .docx file
Returns:
List[Document]: list of documents extracted from the .docx file

View File

@ -37,7 +37,7 @@ class HtmlReader(BaseReader):
"""Load data using Html reader
Args:
file_path: path to pdf file
file_path: path to HTML file
extra_info: extra information passed to this reader during extracting data
Returns:

View File

@ -17,6 +17,8 @@ description = "Kotaemon core library for AI development."
dependencies = [
"langchain",
"langchain-community",
"langchain-openai",
"openai",
"theflow",
"llama-index>=0.9.0,<0.10.0",
"llama-hub",
@ -27,6 +29,11 @@ dependencies = [
"pandas",
"trogon",
"tenacity",
"python-dotenv", # currently used to read configs from file, should be remove in the future
"chromadb",
"unstructured",
"pypdf",
"html2text",
]
readme = "README.md"
license = { text = "MIT License" }
@ -42,6 +49,18 @@ classifiers = [
]
[project.optional-dependencies]
adv = [
"wikipedia",
"duckduckgo-search",
"googlesearch-python",
"python-docx",
"pytest-mock",
"unstructured[pdf]",
"sentence_transformers",
"cohere",
"elasticsearch",
"llama-cpp-python",
]
dev = [
"ipython",
"pytest",
@ -50,23 +69,8 @@ dev = [
"flake8",
"sphinx",
"coverage",
"openai",
"langchain-openai",
"chromadb",
"wikipedia",
"duckduckgo-search",
"googlesearch-python",
"python-docx",
"python-dotenv",
"pytest-mock",
"unstructured[pdf]",
"sentence_transformers",
"cohere",
"elasticsearch",
"pypdf",
"html2text",
"llama-cpp-python",
]
all = ["kotaemon[adv,dev]"]
[project.scripts]
kh = "kotaemon.cli:main"

View File

@ -25,7 +25,7 @@ from theflow.utils.modules import import_dotted_string
from kotaemon.base import RetrievedDocument
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.indices.ingests import DocumentIngestor
from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking
from kotaemon.indices.rankings import BaseReranking, LLMReranking
from .base import BaseFileIndexIndexing, BaseFileIndexRetriever
@ -67,9 +67,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
vector_retrieval: VectorRetrieval = VectorRetrieval.withx(
embedding=embeddings.get_default(),
)
reranker: BaseReranking = CohereReranking.withx(
cohere_api_key=getattr(settings, "COHERE_API_KEY", "")
) >> LLMReranking.withx(llm=llms.get_lowest_cost())
reranker: BaseReranking = LLMReranking.withx(llm=llms.get_lowest_cost())
get_extra_table: bool = False
def run(

View File

@ -13,18 +13,14 @@ version = "0.2.0"
requires-python = ">= 3.10"
description = "RAG-based Question and Answering Application"
dependencies = [
"chromadb",
"click",
"cohere",
"platformdirs",
"pluggy",
"python-decouple",
"python-dotenv",
"python-pptx",
"sqlalchemy",
"sqlmodel",
"tiktoken",
"unstructured[pdf]",
]
readme = "README.md"
license = { text = "MIT License" }

View File

@ -92,7 +92,7 @@ function install_dependencies() {
if pip list 2>/dev/null | grep -q "kotaemon"; then
echo "Requirements are already installed"
else
local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
local kotaemon_root="$(pwd)/libs/kotaemon"
local ktem_root="$(pwd)/libs/ktem/"
echo "" && echo "Install kotaemon's requirements"

View File

@ -92,7 +92,7 @@ function install_dependencies() {
if pip list 2>/dev/null | grep -q "kotaemon"; then
echo "Requirements are already installed"
else
local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
local kotaemon_root="$(pwd)/libs/kotaemon"
local ktem_root="$(pwd)/libs/ktem/"
echo "" && echo "Install kotaemon's requirements"

View File

@ -114,7 +114,7 @@ IF %ERRORLEVEL% == 0 (
ECHO Dependencies are already installed
) ELSE (
ECHO Install kotaemon's requirements
CALL python -m pip install -e "%CD%\libs\kotaemon\.[dev]"
CALL python -m pip install -e "%CD%\libs\kotaemon"
ECHO Install ktem's requirements
CALL python -m pip install -e "%CD%\libs\ktem"