Feat/local endpoint llm (#148)

* serve local model in a different process from the app (a rough sketch of the idea follows the commit metadata below)
---------

Co-authored-by: albert <albert@cinnamon.is>
Co-authored-by: trducng <trungduc1992@gmail.com>
Author: ian_Cin
Date: 2024-03-15 16:17:33 +07:00 (committed by GitHub)
Parent: 2950e6ed02
Commit: df12dec732
20 changed files with 675 additions and 79 deletions
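
The commit message is terse about what "a different process" means in practice. As a rough, self-contained illustration of that idea only — not the code in this commit; the script name, port, and route here are hypothetical — the app can launch the model server as a child process and talk to it over local HTTP:

# sketch_local_endpoint.py -- hypothetical illustration, not part of this commit
import subprocess
import sys

import requests  # assumed HTTP client; any client would do

# Run the model server as its own OS process so a heavy local model
# cannot block or take down the main app (script name/port are made up).
server = subprocess.Popen([sys.executable, "serve_llm.py", "--port", "31415"])
try:
    # The app then talks to the model through a local HTTP endpoint.
    resp = requests.post(
        "http://127.0.0.1:31415/generate",
        json={"prompt": "Hello"},
        timeout=60,
    )
    print(resp.json())
finally:
    server.terminate()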


@@ -118,7 +118,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
         # rerank
         docs = self.vector_retrieval(text=text, top_k=top_k, **kwargs)
-        if self.get_from_path("reranker"):
+        if docs and self.get_from_path("reranker"):
             docs = self.reranker(docs, query=text)
         if not self.get_extra_table:

@@ -200,24 +200,37 @@ class AnswerWithContextPipeline(BaseComponent):
             lang=self.lang,
         )
-        citation_task = asyncio.create_task(
-            self.citation_pipeline.ainvoke(context=evidence, question=question)
-        )
-        print("Citation task created")
+        if evidence:
+            citation_task = asyncio.create_task(
+                self.citation_pipeline.ainvoke(context=evidence, question=question)
+            )
+            print("Citation task created")
         messages = []
         if self.system_prompt:
             messages.append(SystemMessage(content=self.system_prompt))
         messages.append(HumanMessage(content=prompt))
         output = ""
-        for text in self.llm.stream(messages):
-            output += text.text
-            self.report_output({"output": text.text})
-            await asyncio.sleep(0)
+        try:
+            # try streaming first
+            print("Trying LLM streaming")
+            for text in self.llm.stream(messages):
+                output += text.text
+                self.report_output({"output": text.text})
+                await asyncio.sleep(0)
+        except NotImplementedError:
+            print("Streaming is not supported, falling back to normal processing")
+            output = self.llm(messages).text
+            self.report_output({"output": output})
         # retrieve the citation
         print("Waiting for citation task")
-        citation = await citation_task
+        if evidence:
+            citation = await citation_task
+        else:
+            citation = None
         answer = Document(text=output, metadata={"citation": citation})
         return answer
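
The hunk above makes two changes: the citation task is only created and awaited when there is evidence, and streaming is attempted first with a blocking call as fallback, since a local endpoint LLM may not implement stream(). A standalone sketch of that fallback pattern, mirroring the diff (the llm and report interfaces here are stand-ins, not kotaemon's actual classes):

import asyncio

async def generate(llm, messages, report):
    # Accumulate the answer, preferring token streaming when available.
    output = ""
    try:
        for chunk in llm.stream(messages):  # may raise NotImplementedError
            output += chunk.text
            report({"output": chunk.text})
            await asyncio.sleep(0)  # yield so other tasks (e.g. citation) can run
    except NotImplementedError:
        # Backend cannot stream: one blocking call, reported all at once.
        output = llm(messages).text
        report({"output": output})
    return output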