Add new OCRReader with PDF+OCR text merging (#66)

This change speeds up OCR extraction by allowing OCR to be bypassed for text that is irrelevant (i.e. not inside a table). --------- Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
2023-11-13 17:43:02 +07:00
parent d79b3744cb
commit 4704e2c11a
10 changed files with 523 additions and 126 deletions
--- a/knowledgehub/pipelines/ingest.py
+++ b/knowledgehub/pipelines/ingest.py
@@ -70,9 +70,11 @@ class ReaderIndexingPipeline(BaseComponent):
            embedding=self.embedding,
        )

-    text_splitter: SimpleNodeParser = SimpleNodeParser.withx(
-        chunk_size=1024, chunk_overlap=256
-    )
+    @Node.auto(depends_on=["chunk_size", "chunk_overlap"])
+    def text_splitter(self) -> SimpleNodeParser:
+        return SimpleNodeParser(
+            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
+        )

    def run(
        self,
--- a/knowledgehub/pipelines/qa.py
+++ b/knowledgehub/pipelines/qa.py
@@ -2,7 +2,7 @@ import os
 from pathlib import Path
 from typing import List

-from theflow import Node, Param
+from theflow import Node
 from theflow.utils.modules import ObjectInitDeclaration as _

 from kotaemon.base import BaseComponent
@@ -43,8 +43,8 @@ class QuestionAnsweringPipeline(BaseComponent):
        request_timeout=60,
    )

-    vector_store: Param[InMemoryVectorStore] = Param(_(InMemoryVectorStore))
-    doc_store: Param[InMemoryDocumentStore] = Param(_(InMemoryDocumentStore))
+    vector_store: _[InMemoryVectorStore] = _(InMemoryVectorStore)
+    doc_store: _[InMemoryDocumentStore] = _(InMemoryDocumentStore)

    embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
        model="text-embedding-ada-002",