feat: allow users to set chunk size and chunk overlap for indices (#524)
* use tzlocal to get the local time
* delete tmp folder
* update date_created and date_updated with the current timezone
* pass pre-commit
* update date_created field to default to local time
* add chunk size and chunk overlap params for indices
* refactor code to pass pre-commit
* fix: minor logic updates

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
This commit is contained in:
parent
a1fecfac45
commit
32732c35de
|
@ -55,6 +55,8 @@ class BaseFileIndexIndexing(BaseComponent):
|
||||||
FSPath = Param(help="The file storage path")
|
FSPath = Param(help="The file storage path")
|
||||||
user_id = Param(help="The user id")
|
user_id = Param(help="The user id")
|
||||||
private = Param(False, help="Whether this is private index")
|
private = Param(False, help="Whether this is private index")
|
||||||
|
chunk_size = Param(help="Chunk size for this index")
|
||||||
|
chunk_overlap = Param(help="Chunk overlap for this index")
|
||||||
|
|
||||||
def run(
|
def run(
|
||||||
self, file_paths: str | Path | list[str | Path], *args, **kwargs
|
self, file_paths: str | Path | list[str | Path], *args, **kwargs
|
||||||
|
|
|
@ -404,6 +404,25 @@ class FileIndex(BaseIndex):
|
||||||
"choices": [("Yes", True), ("No", False)],
|
"choices": [("Yes", True), ("No", False)],
|
||||||
"info": "If private, files will not be accessible across users.",
|
"info": "If private, files will not be accessible across users.",
|
||||||
},
|
},
|
||||||
|
"chunk_size": {
|
||||||
|
"name": "Size of chunk (number of tokens)",
|
||||||
|
"value": 0,
|
||||||
|
"component": "number",
|
||||||
|
"info": (
|
||||||
|
"Number of tokens of each text segment. "
|
||||||
|
"Set 0 to use developer setting."
|
||||||
|
),
|
||||||
|
},
|
||||||
|
"chunk_overlap": {
|
||||||
|
"name": "Number of overlapping tokens between chunks",
|
||||||
|
"value": 0,
|
||||||
|
"component": "number",
|
||||||
|
"info": (
|
||||||
|
"Number of tokens that consecutive text segments "
|
||||||
|
"should overlap with each other. "
|
||||||
|
"Set 0 to use developer setting."
|
||||||
|
),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
|
def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
|
||||||
|
@ -423,6 +442,8 @@ class FileIndex(BaseIndex):
|
||||||
obj.FSPath = self._fs_path
|
obj.FSPath = self._fs_path
|
||||||
obj.user_id = user_id
|
obj.user_id = user_id
|
||||||
obj.private = self.config.get("private", False)
|
obj.private = self.config.get("private", False)
|
||||||
|
obj.chunk_size = self.config.get("chunk_size", 0)
|
||||||
|
obj.chunk_overlap = self.config.get("chunk_overlap", 0)
|
||||||
|
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
|
|
@ -729,7 +729,11 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
|
||||||
|
|
||||||
Can subclass this method for a more elaborate pipeline routing strategy.
|
Can subclass this method for a more elaborate pipeline routing strategy.
|
||||||
"""
|
"""
|
||||||
_, chunk_size, chunk_overlap = dev_settings()
|
|
||||||
|
_, dev_chunk_size, dev_chunk_overlap = dev_settings()
|
||||||
|
|
||||||
|
chunk_size = self.chunk_size or dev_chunk_size
|
||||||
|
chunk_overlap = self.chunk_overlap or dev_chunk_overlap
|
||||||
|
|
||||||
# check if file_path is a URL
|
# check if file_path is a URL
|
||||||
if self.is_url(file_path):
|
if self.is_url(file_path):
|
||||||
|
@ -744,12 +748,14 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
|
||||||
"the suitable pipeline for this file type in the settings."
|
"the suitable pipeline for this file type in the settings."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
print(f"Chunk size: {chunk_size}, chunk overlap: {chunk_overlap}")
|
||||||
|
|
||||||
print("Using reader", reader)
|
print("Using reader", reader)
|
||||||
pipeline: IndexPipeline = IndexPipeline(
|
pipeline: IndexPipeline = IndexPipeline(
|
||||||
loader=reader,
|
loader=reader,
|
||||||
splitter=TokenSplitter(
|
splitter=TokenSplitter(
|
||||||
chunk_size=chunk_size or 1024,
|
chunk_size=chunk_size or 1024,
|
||||||
chunk_overlap=chunk_overlap if chunk_overlap is not None else 256,
|
chunk_overlap=chunk_overlap or 256,
|
||||||
separator="\n\n",
|
separator="\n\n",
|
||||||
backup_separators=["\n", ".", "\u200B"],
|
backup_separators=["\n", ".", "\u200B"],
|
||||||
),
|
),
|
||||||
|
|
Loading…
Reference in New Issue
Block a user