fix: update setup instructions (#144) #none

* add activate directory to gitignore

* add my custom env to gitignore, will have to change that

* add unstructured to kotaemon pyproject.toml

* add .env to gitignore

* remove .env from tracking

* make changes to the run_macos script; update README with more detailed instructions

* remove my personal changes from gitignore

* remove line from run_macos script

* remove option for not installing miniconda for non-technical users; mark docker dependency as optional

* docs: update demo URL

* gitignore changes

* merge .env.example

* revert changes to run_macos.sh

* move unstructured to advanced dependencies

* add link to unstructured system dependencies

* remove api key

* fix: skip tests when unstructured pdf not installed

* chore: loosen unstructured package version in pyproject.toml

* chore: correct syntax

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
Co-authored-by: cin-albert <albert@cinnamon.is>
This commit is contained in:
Ben Dykstra
2024-09-29 09:26:02 -06:00
committed by GitHub
parent 1522a3ab5a
commit f7b6f313b5
8 changed files with 42 additions and 16 deletions

View File

@@ -52,7 +52,7 @@ dependencies = [
"python-dotenv>=1.0.1,<1.1",
"tenacity>=8.2.3,<8.3",
"theflow>=0.8.6,<0.9.0",
"trogon>=0.5.0,<0.6",
"trogon>=0.5.0,<0.6"
]
readme = "README.md"
authors = [
@@ -73,11 +73,14 @@ adv = [
"fastembed",
"googlesearch-python>=1.2.4,<1.3",
"llama-cpp-python<0.2.8",
"sentence-transformers",
"wikipedia>=1.4.0,<1.5",
"llama-index>=0.10.40,<0.11.0",
"llama-index-vector-stores-milvus",
"llama-index-vector-stores-qdrant",
"python-docx>=1.1.0,<1.2",
"sentence-transformers",
"tabulate",
"unstructured>=0.15.8,<0.16",
"wikipedia>=1.4.0,<1.5",
]
dev = [
"black",

View File

@@ -42,9 +42,10 @@ def if_sentence_fastembed_not_installed():
return False
def if_unstructured_not_installed():
def if_unstructured_pdf_not_installed():
try:
import unstructured # noqa: F401
from unstructured.partition.pdf import partition_pdf # noqa: F401
except ImportError:
return True
else:
@@ -81,8 +82,8 @@ skip_when_fastembed_not_installed = pytest.mark.skipif(
if_sentence_fastembed_not_installed(), reason="fastembed is not installed"
)
skip_when_unstructured_not_installed = pytest.mark.skipif(
if_unstructured_not_installed(), reason="unstructured is not installed"
skip_when_unstructured_pdf_not_installed = pytest.mark.skipif(
if_unstructured_pdf_not_installed(), reason="unstructured is not installed"
)
skip_when_cohere_not_installed = pytest.mark.skipif(

View File

@@ -14,7 +14,7 @@ from kotaemon.loaders import (
UnstructuredReader,
)
from .conftest import skip_when_unstructured_not_installed
from .conftest import skip_when_unstructured_pdf_not_installed
def test_docx_reader():
@@ -54,7 +54,7 @@ def test_pdf_reader():
assert len(nodes) > 0
@skip_when_unstructured_not_installed
@skip_when_unstructured_pdf_not_installed
def test_unstructured_pdf_reader():
reader = UnstructuredReader()
dirpath = Path(__file__).parent

View File

@@ -5,7 +5,7 @@ import pytest
from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
from .conftest import skip_when_unstructured_not_installed
from .conftest import skip_when_unstructured_pdf_not_installed
input_file = Path(__file__).parent / "resources" / "table.pdf"
input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx"
@@ -28,7 +28,7 @@ def mathpix_output():
return content
@skip_when_unstructured_not_installed
@skip_when_unstructured_pdf_not_installed
def test_ocr_reader(fullocr_output):
reader = OCRReader()
documents = reader.load_data(input_file, response_content=fullocr_output)