Bases: BaseSplitter, Generic[DocType]
Splitter for markdown documents with token-based chunking.
Splits markdown content into nodes based on document structure and
token limits. Supports node merging and splitting to maintain
consistent chunk sizes.
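A minimal usage sketch (not part of the source): it assumes the class can be imported from the module path listed below and uses a tiktoken encoder as the token counter, but any callable that returns a token sequence works.

import tiktoken
from llama_index.core import Document
from embedding.splitters.basic_markdown.basic_markdown_splitter import BasicMarkdownSplitter  # assumed import path

tokenizer = tiktoken.get_encoding("cl100k_base")

splitter = BasicMarkdownSplitter(
    chunk_size_in_tokens=512,
    chunk_overlap_in_tokens=64,
    tokenize_func=tokenizer.encode,
)

doc = Document(text="# Title\n\nIntro paragraph.\n\n## Details\n\nMore markdown text.")
nodes = splitter.split(doc)  # a list of TextNode chunks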
Source code in src/embedding/splitters/basic_markdown/basic_markdown_splitter.py
class BasicMarkdownSplitter(BaseSplitter, Generic[DocType]):
"""Splitter for markdown documents with token-based chunking.
Splits markdown content into nodes based on document structure and
token limits. Supports node merging and splitting to maintain
consistent chunk sizes.
"""
def __init__(
self,
chunk_size_in_tokens: int,
chunk_overlap_in_tokens: int,
tokenize_func: Callable,
):
"""Initialize markdown splitter.
Args:
chunk_size_in_tokens: Maximum tokens per chunk
chunk_overlap_in_tokens: Token overlap between chunks
tokenize_func: Function to tokenize text for token counting
"""
self.chunk_size_in_tokens = chunk_size_in_tokens
self.tokenize_func = tokenize_func
self.markdown_node_parser = MarkdownNodeParser()
self.sentence_splitter = SentenceSplitter(
chunk_size=chunk_size_in_tokens,
chunk_overlap=chunk_overlap_in_tokens,
tokenizer=tokenize_func,
)
def split(self, document: DocType) -> List[TextNode]:
"""Split markdown documents into text nodes.
Split markdown document by markdown tags, then adjusts node sizes
through splitting large nodes and merging small nodes to optimize
for the target chunk size.
Args:
document: Markdown document to be processed
Returns:
List[TextNode]: Collection of processed text nodes with optimized sizes
"""
document_nodes = self.markdown_node_parser.get_nodes_from_documents(
[document]
)
document_nodes = self._split_big_nodes(document_nodes)
document_nodes = self._merge_small_nodes(document_nodes)
return document_nodes
def _split_big_nodes(
self, document_nodes: List[TextNode]
) -> List[TextNode]:
"""Split oversized nodes into smaller chunks.
Identifies nodes exceeding the token limit and processes them
through the sentence splitter to create smaller, semantically
coherent chunks.
Args:
document_nodes: Collection of nodes to process
Returns:
List[TextNode]: Processed nodes within token size limits
"""
new_document_nodes = []
for document_node in document_nodes:
text = document_node.text
document_node_size = len(self.tokenize_func(text))
if document_node_size > self.chunk_size_in_tokens:
document_sub_nodes = self._split_big_node(document_node)
new_document_nodes.extend(document_sub_nodes)
else:
new_document_nodes.append(document_node)
return new_document_nodes
def _split_big_node(self, document_node: TextNode) -> List[TextNode]:
"""Split single oversized node into smaller nodes.
Uses sentence boundary detection to create semantically meaningful
smaller chunks from a large node, preserving metadata from the
original node.
Args:
document_node: Node exceeding token size limit
Returns:
List[TextNode]: Collection of smaller nodes derived from original
"""
text = document_node.text
sub_texts = self.sentence_splitter.split_text(text)
sub_nodes = []
for sub_text in sub_texts:
sub_node = document_node.model_copy()
sub_node.id_ = str(uuid.uuid4())
sub_node.text = sub_text
sub_nodes.append(sub_node)
return sub_nodes
def _merge_small_nodes(
self, document_nodes: List[TextNode]
) -> List[TextNode]:
"""Merge adjacent small nodes into larger chunks.
Combines consecutive nodes when their combined token count remains
under the maximum limit, optimizing for fewer, larger chunks
while respecting token boundaries.
Args:
document_nodes: Collection of nodes to potentially merge
Returns:
List[TextNode]: Optimized collection with merged nodes
"""
new_document_nodes = []
current_node = document_nodes[0]
for node in document_nodes[1:]:
current_text = current_node.text
current_node_size = len(self.tokenize_func(current_text))
node_text = node.text
node_size = len(self.tokenize_func(node_text))
if current_node_size + node_size <= self.chunk_size_in_tokens:
current_node.text += node.text
else:
new_document_nodes.append(current_node)
current_node = node
new_document_nodes.append(current_node)
return new_document_nodes
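To make the split-then-merge behavior concrete, here is a small illustrative sketch; it reaches into the private _merge_small_nodes helper purely for demonstration and uses whitespace tokens so the counts are easy to follow.

from llama_index.core.schema import TextNode

splitter = BasicMarkdownSplitter(
    chunk_size_in_tokens=8,
    chunk_overlap_in_tokens=0,
    tokenize_func=str.split,  # whitespace "tokens" keep the arithmetic obvious
)

nodes = [
    TextNode(text="one two three "),            # 3 tokens
    TextNode(text="four five "),                # 2 tokens
    TextNode(text="six seven eight nine ten"),  # 5 tokens
]

# 3 + 2 <= 8, so the first two nodes are concatenated into one chunk; adding
# the third (5 more tokens) would exceed the limit, so it starts a new chunk.
merged = splitter._merge_small_nodes(nodes)
print(len(merged))  # 2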
__init__(chunk_size_in_tokens, chunk_overlap_in_tokens, tokenize_func)
Initialize markdown splitter.
Parameters:
- chunk_size_in_tokens (int) – Maximum tokens per chunk
- chunk_overlap_in_tokens (int) – Token overlap between chunks
- tokenize_func (Callable) – Function to tokenize text for token counting
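The tokenizer contract is intentionally small: the splitter (and the SentenceSplitter it creates) only calls len(tokenize_func(text)) to count tokens, so any callable that maps a string to a sequence can serve. A hypothetical word-count tokenizer, for example:

def word_tokenize(text: str) -> list[str]:
    # Whitespace-separated words as a crude but valid token measure.
    return text.split()

splitter = BasicMarkdownSplitter(
    chunk_size_in_tokens=256,
    chunk_overlap_in_tokens=32,  # consumed only by the sentence splitter when breaking up oversized nodes
    tokenize_func=word_tokenize,
)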
Source code in src/embedding/splitters/basic_markdown/basic_markdown_splitter.py
def __init__(
self,
chunk_size_in_tokens: int,
chunk_overlap_in_tokens: int,
tokenize_func: Callable,
):
"""Initialize markdown splitter.
Args:
chunk_size_in_tokens: Maximum tokens per chunk
chunk_overlap_in_tokens: Token overlap between chunks
tokenize_func: Function to tokenize text for token counting
"""
self.chunk_size_in_tokens = chunk_size_in_tokens
self.tokenize_func = tokenize_func
self.markdown_node_parser = MarkdownNodeParser()
self.sentence_splitter = SentenceSplitter(
chunk_size=chunk_size_in_tokens,
chunk_overlap=chunk_overlap_in_tokens,
tokenizer=tokenize_func,
)
split(document)
Split a markdown document into text nodes.
Splits the markdown document by markdown tags, then adjusts node sizes by splitting large nodes and merging small nodes to optimize for the target chunk size.
Parameters:
- document (DocType) – Markdown document to be processed
Returns:
- List[TextNode] – Collection of processed text nodes with optimized sizes
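As a sketch of how the output might be inspected (assuming a splitter constructed as in the examples above), each returned node can be re-tokenized to check how close its size is to the configured chunk size:

from llama_index.core import Document

markdown = "# Guide\n\nIntro.\n\n## Setup\n\nStep one. Step two.\n\n## Usage\n\nDetails here."
nodes = splitter.split(Document(text=markdown))

for node in nodes:
    token_count = len(splitter.tokenize_func(node.text))
    print(token_count, node.text[:40].replace("\n", " "))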
Source code in src/embedding/splitters/basic_markdown/basic_markdown_splitter.py
def split(self, document: DocType) -> List[TextNode]:
"""Split markdown documents into text nodes.
Split markdown document by markdown tags, then adjusts node sizes
through splitting large nodes and merging small nodes to optimize
for the target chunk size.
Args:
document: Markdown document to be processed
Returns:
List[TextNode]: Collection of processed text nodes with optimized sizes
"""
document_nodes = self.markdown_node_parser.get_nodes_from_documents(
[document]
)
document_nodes = self._split_big_nodes(document_nodes)
document_nodes = self._merge_small_nodes(document_nodes)
return document_nodes