feat: support for visualizing citation results (via embeddings) (#461)

* feat: support for visualizing citation results (via embeddings)
  Signed-off-by: Kennywu <jdlow@live.cn>
* fix: remove ktem dependency in visualize_cited
* fix: limit onnx version for fastembed
* fix: test case of indexing
* fix: minor update
* fix: chroma req
* fix: chroma req
---------
Signed-off-by: Kennywu <jdlow@live.cn>
Co-authored-by: Tadashi <tadashi@cinnamon.is>
2024-11-05 15:02:57 +08:00
parent bd2490bef1
commit d127fec9f7
4 changed files with 196 additions and 5 deletions
--- a/libs/kotaemon/kotaemon/indices/vectorindex.py
+++ b/libs/kotaemon/kotaemon/indices/vectorindex.py
@@ -53,7 +53,11 @@ class VectorIndexing(BaseIndexing):
    def write_chunk_to_file(self, docs: list[Document]):
        # save the chunks content into markdown format
        if self.cache_dir:
-            file_name = Path(docs[0].metadata["file_name"])
+            file_name = docs[0].metadata.get("file_name")
+            if not file_name:
+                return
+
+            file_name = Path(file_name)
            for i in range(len(docs)):
                markdown_content = ""
                if "page_label" in docs[i].metadata:
--- a/libs/kotaemon/pyproject.toml
+++ b/libs/kotaemon/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
    "langchain-cohere>=0.2.4,<0.3.0",
    "llama-hub>=0.0.79,<0.1.0",
    "llama-index>=0.10.40,<0.11.0",
+    "chromadb<=0.5.16",
    "llama-index-vector-stores-chroma>=0.1.9",
    "llama-index-vector-stores-lancedb",
    "openai>=1.23.6,<2",
@@ -52,7 +53,8 @@ dependencies = [
    "python-dotenv>=1.0.1,<1.1",
    "tenacity>=8.2.3,<8.3",
    "theflow>=0.8.6,<0.9.0",
-    "trogon>=0.5.0,<0.6"
+    "trogon>=0.5.0,<0.6",
+    "umap-learn==0.5.5",
 ]
 readme = "README.md"
 authors = [
@@ -71,6 +73,7 @@ adv = [
    "duckduckgo-search>=6.1.0,<6.2",
    "elasticsearch>=8.13.0,<8.14",
    "fastembed",
+    "onnxruntime<v1.20",
    "googlesearch-python>=1.2.4,<1.3",
    "llama-cpp-python<0.2.8",
    "llama-index>=0.10.40,<0.11.0",