class CodeSplitter(TextSplitter):
    """Split code using an AST parser.

    The source is parsed with a tree-sitter parser and the resulting syntax
    tree is walked top-down, packing sibling nodes into chunks bounded by
    ``max_chars``.

    Thank you to Kevin Lu / SweepAI for suggesting this elegant code splitting
    solution.
    https://docs.sweep.dev/blogs/chunking-2m-files
    """

    language: str = Field(
        description="The programming language of the code being split."
    )
    chunk_lines: int = Field(
        default=DEFAULT_CHUNK_LINES,
        description="The number of lines to include in each chunk.",
        gt=0,
    )
    chunk_lines_overlap: int = Field(
        default=DEFAULT_LINES_OVERLAP,
        description="How many lines of code each chunk overlaps with.",
        gt=0,
    )
    max_chars: int = Field(
        default=DEFAULT_MAX_CHARS,
        description="Maximum number of characters per chunk.",
        gt=0,
    )
    _parser: Any = PrivateAttr()

    def __init__(
        self,
        language: str,
        chunk_lines: int = DEFAULT_CHUNK_LINES,
        chunk_lines_overlap: int = DEFAULT_LINES_OVERLAP,
        max_chars: int = DEFAULT_MAX_CHARS,
        parser: Any = None,
        callback_manager: Optional[CallbackManager] = None,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ) -> None:
        """Initialize a CodeSplitter.

        Args:
            language: Name of the language to parse; used to look up a parser
                when ``parser`` is not supplied.
            chunk_lines: Stored on the ``chunk_lines`` field.
            chunk_lines_overlap: Stored on the ``chunk_lines_overlap`` field.
            max_chars: Upper bound used when packing syntax nodes into chunks.
            parser: A tree-sitter ``Parser``. When ``None``, one is obtained
                from ``tree_sitter_languages.get_parser(language)``.
            callback_manager: Callback manager; a fresh ``CallbackManager([])``
                is used when omitted.
            include_metadata: Forwarded to the base class.
            include_prev_next_rel: Forwarded to the base class.
            id_func: Id generator; ``default_id_func`` is used when omitted.

        Raises:
            ImportError: If no parser is given and ``tree_sitter_languages``
                is not installed.
            ValueError: If the resolved parser is not a tree-sitter ``Parser``.
        """
        from tree_sitter import Parser  # pants: no-infer-dep

        if parser is None:
            try:
                import tree_sitter_languages  # pants: no-infer-dep

                parser = tree_sitter_languages.get_parser(language)
            except ImportError:
                # The two sentences were previously concatenated without a
                # separating space ("CodeSplitter.Or pass in ...").
                raise ImportError(
                    "Please install tree_sitter_languages to use CodeSplitter. "
                    "Or pass in a parser object."
                )
            except Exception:
                print(
                    f"Could not get parser for language {language}. Check "
                    "https://github.com/grantjenks/py-tree-sitter-languages#license "
                    "for a list of valid languages."
                )
                raise
        if not isinstance(parser, Parser):
            raise ValueError("Parser must be a tree-sitter Parser object.")

        # Kept ahead of super().__init__() exactly as in the original ordering.
        self._parser = parser

        callback_manager = callback_manager or CallbackManager([])
        id_func = id_func or default_id_func

        super().__init__(
            language=language,
            chunk_lines=chunk_lines,
            chunk_lines_overlap=chunk_lines_overlap,
            max_chars=max_chars,
            callback_manager=callback_manager,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            id_func=id_func,
        )

    @classmethod
    def from_defaults(
        cls,
        language: str,
        chunk_lines: int = DEFAULT_CHUNK_LINES,
        chunk_lines_overlap: int = DEFAULT_LINES_OVERLAP,
        max_chars: int = DEFAULT_MAX_CHARS,
        callback_manager: Optional[CallbackManager] = None,
        parser: Any = None,
    ) -> "CodeSplitter":
        """Create a CodeSplitter with default values.

        All arguments are forwarded to the constructor unchanged.
        """
        return cls(
            language=language,
            chunk_lines=chunk_lines,
            chunk_lines_overlap=chunk_lines_overlap,
            max_chars=max_chars,
            # Previously accepted but never forwarded, so a caller-supplied
            # callback manager was silently replaced by an empty one.
            callback_manager=callback_manager,
            parser=parser,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the name of this class."""
        return "CodeSplitter"

    def _chunk_node(self, node: Any, text: str, last_end: int = 0) -> List[str]:
        """Chunk the source under ``node``.

        Args:
            node: Syntax-tree node whose children are packed into chunks.
            text: The source code the tree was parsed from.
            last_end: Byte offset (into the UTF-8 encoding of ``text``) where
                the previous chunk ended.

        Returns:
            The chunk strings, in source order.
        """
        # Node offsets are byte offsets into the UTF-8 buffer handed to the
        # parser, so slicing has to happen on that buffer, not on the str.
        return self._chunk_node_bytes(node, text.encode("utf-8"), last_end)

    def _chunk_node_bytes(
        self, node: Any, text_bytes: bytes, last_end: int = 0
    ) -> List[str]:
        """Pack the children of ``node`` into chunks, slicing ``text_bytes``."""
        new_chunks: List[str] = []
        current_chunk = ""
        for child in node.children:
            child_size = child.end_byte - child.start_byte
            if child_size > self.max_chars:
                # Child is too big, recursively chunk the child
                if current_chunk:
                    new_chunks.append(current_chunk)
                current_chunk = ""
                new_chunks.extend(
                    self._chunk_node_bytes(child, text_bytes, last_end)
                )
            elif len(current_chunk) + child_size > self.max_chars:
                # Child would make the current chunk too big, so start a new chunk
                new_chunks.append(current_chunk)
                current_chunk = self._decode_span(
                    text_bytes, last_end, child.end_byte
                )
            else:
                current_chunk += self._decode_span(
                    text_bytes, last_end, child.end_byte
                )
            last_end = child.end_byte
        if current_chunk:
            new_chunks.append(current_chunk)
        return new_chunks

    @staticmethod
    def _decode_span(text_bytes: bytes, start: int, end: int) -> str:
        """Decode ``text_bytes[start:end]`` as UTF-8."""
        # errors="replace" keeps a caller-supplied offset that lands inside a
        # multi-byte character from turning chunking into a hard failure.
        return text_bytes[start:end].decode("utf-8", errors="replace")

    def split_text(self, text: str) -> List[str]:
        """Split incoming code and return chunks using the AST.

        Args:
            text: Source code to split.

        Returns:
            The chunks with surrounding whitespace stripped.

        Raises:
            ValueError: If the first top-level node of the parse tree is an
                ``ERROR`` node.
        """
        with self.callback_manager.event(
            CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
        ) as event:
            # Encode once; the same buffer is parsed and later sliced so that
            # node byte offsets and slice indices refer to the same data.
            text_bytes = bytes(text, "utf-8")
            tree = self._parser.parse(text_bytes)

            top_level = tree.root_node.children
            if top_level and top_level[0].type == "ERROR":
                raise ValueError(
                    f"Could not parse code with language {self.language}."
                )

            chunks = [
                chunk.strip()
                for chunk in self._chunk_node_bytes(tree.root_node, text_bytes)
            ]
            event.on_end(
                payload={EventPayload.CHUNKS: chunks},
            )
            return chunks
@classmethod
def from_defaults(
    cls,
    language: str,
    chunk_lines: int = DEFAULT_CHUNK_LINES,
    chunk_lines_overlap: int = DEFAULT_LINES_OVERLAP,
    max_chars: int = DEFAULT_MAX_CHARS,
    callback_manager: Optional[CallbackManager] = None,
    parser: Any = None,
) -> "CodeSplitter":
    """Create a CodeSplitter with default values.

    Args:
        language: Programming language of the code to split.
        chunk_lines: Number of lines to include in each chunk.
        chunk_lines_overlap: Number of lines each chunk overlaps with.
        max_chars: Maximum number of characters per chunk.
        callback_manager: Optional callback manager passed to the constructor.
        parser: Optional parser object passed to the constructor.

    Returns:
        A new instance built by calling ``cls`` with the given arguments.
    """
    return cls(
        language=language,
        chunk_lines=chunk_lines,
        chunk_lines_overlap=chunk_lines_overlap,
        max_chars=max_chars,
        # Previously accepted but never forwarded, so a caller-supplied
        # callback manager was silently discarded.
        callback_manager=callback_manager,
        parser=parser,
    )
def split_text(self, text: str) -> List[str]:
    """Parse ``text`` and return its AST-derived chunks, whitespace-stripped.

    The work runs inside a ``CBEventType.CHUNKING`` callback event: the input
    text is reported as the starting payload and the resulting chunks are
    reported through ``event.on_end``.

    Args:
        text: Source code to split.

    Returns:
        The chunks produced by ``self._chunk_node`` for the root node, each
        passed through ``str.strip``.

    Raises:
        ValueError: If the parse tree has top-level children and the first
            one is an ``ERROR`` node.
    """
    with self.callback_manager.event(
        CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
    ) as event:
        encoded = bytes(text, "utf-8")
        tree = self._parser.parse(encoded)

        # Guard clause: bail out when the very first top-level node is an error.
        top_level = tree.root_node.children
        if top_level and top_level[0].type == "ERROR":
            raise ValueError(f"Could not parse code with language {self.language}.")

        stripped = []
        for piece in self._chunk_node(tree.root_node, text):
            stripped.append(piece.strip())

        event.on_end(payload={EventPayload.CHUNKS: stripped})
        return stripped