Structured Element Chunker
Installation
# Linux / macOS (you can use a Conda environment)
pip install --extra-index-url "https://oauth2accesstoken:$(gcloud auth print-access-token)@glsdk.gdplabs.id/gen-ai-internal/simple/" "gllm-docproc"

# Windows PowerShell (you can use a Conda environment)
$token = (gcloud auth print-access-token)
pip install --extra-index-url "https://oauth2accesstoken:$token@glsdk.gdplabs.id/gen-ai-internal/simple/" "gllm-docproc"

# Windows Command Prompt (you can use a Conda environment)
FOR /F "tokens=*" %T IN ('gcloud auth print-access-token') DO SET TOKEN=%T
pip install --extra-index-url "https://oauth2accesstoken:%TOKEN%@glsdk.gdplabs.id/gen-ai-internal/simple/" "gllm-docproc"
import json

from gllm_docproc.chunker.structured_element import StructuredElementChunker

# Load the parsed elements (input) that you want to chunk.
with open("./data/source/parsed_elements.json", "r") as file:
    elements = json.load(file)

# Initialize the StructuredElementChunker with its default configuration.
chunker = StructuredElementChunker()

# Chunk the elements and inspect the result.
chunked_elements = chunker.chunk(elements)
print(chunked_elements)
python main.py
Customize Structured Element Chunker
import json
from typing import Any

from langchain_text_splitters import RecursiveCharacterTextSplitter

from gllm_docproc.chunker.structured_element import StructuredElementChunker
from gllm_docproc.chunker.table import MARKDOWN, TableChunker
from gllm_docproc.model.element import AUDIO, FOOTER, FOOTNOTE, HEADER, IMAGE, VIDEO, Element

# Load the parsed elements (input) that you want to chunk.
with open("./data/source/parsed_elements.json", "r") as file:
    parsed_elements = json.load(file)

# Text splitter: prefer breaking on headings and paragraphs before
# sentences, clauses, and words; 1800-char chunks with 360-char overlap.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n#", "\n\n", "\n", ". ", "! ", "? ", ", ", " ", ""], chunk_size=1800, chunk_overlap=360
)

# Table chunker: emit tables as Markdown, up to 4000 characters per chunk, no overlap.
table_chunker = TableChunker(chunk_size=4000, chunk_overlap=0, table_format=MARKDOWN)

# StructuredElementChunker wired with the custom splitter and table chunker;
# parent structure info is included in each chunk.
chunker = StructuredElementChunker(
    text_splitter=text_splitter, table_chunker=table_chunker, is_parent_structure_info_included=True
)

# Structure types to skip entirely during chunking.
excluded_structures = [HEADER, FOOTER, FOOTNOTE, IMAGE, VIDEO, AUDIO]
# Enrichment callback: attach source-element positions to each produced chunk.
def enrich_chunk(chunk: Element, elements: list[Element]) -> Element:
    """Attach position metadata from the source elements to a chunk.

    Args:
        chunk: The chunk produced by the chunker.
        elements: The source elements the chunk was built from.

    Returns:
        The same chunk; ``chunk.metadata.position`` is set when at least one
        source element carries both ``coordinates`` and ``page_number``
        metadata attributes.
    """
    position: list[dict[str, Any]] = [
        {
            "coordinates": element.metadata.coordinates,
            "page_number": element.metadata.page_number,
        }
        for element in elements
        # Only elements that expose both attributes contribute a position entry.
        if hasattr(element.metadata, "coordinates") and hasattr(element.metadata, "page_number")
    ]
    if position:
        chunk.metadata.position = position
    return chunk
# Run the chunker, skipping the excluded structures and enriching each chunk.
chunked_elements = chunker.chunk(
    parsed_elements,
    excluded_structures=excluded_structures,
    enrich_chunk=enrich_chunk,
)
Last updated
Was this helpful?