class HWPReader(BaseReader):
    """Hwp Reader.

    Reads text content from an Hwp file. The file is opened as an OLE
    compound document, its ``BodyText/Section<N>`` streams are visited in
    numeric order, and the payload of every text record is decoded and
    concatenated.

    Args:
        None.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.FILE_HEADER_SECTION = "FileHeader"
        self.HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation"
        self.SECTION_NAME_LENGTH = len("Section")
        self.BODYTEXT_SECTION = "BodyText"
        # Record tag ids whose payload is decoded as text.
        self.HWP_TEXT_TAGS = [67]
        # Text of the most recently loaded file; initialised so that
        # get_text() is safe to call before any file has been loaded.
        self.text = ""

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Load an Hwp file and extract its text.

        Args:
            file (Path): Path for the Hwp file.
            extra_info (Optional[Dict]): Metadata attached to the returned
                document. Defaults to an empty dict.

        Returns:
            List[Document]: A single-element list holding the document.

        Raises:
            Exception: If the required Hwp streams are missing.
        """
        import olefile

        load_file = olefile.OleFileIO(file)
        try:
            file_dir = load_file.listdir()
            if not self.is_valid(file_dir):
                raise Exception("Not Valid HwpFile")
            result_text = self._get_text(load_file, file_dir)
        finally:
            # Always release the underlying file handle, even on failure.
            load_file.close()

        result = self._text_to_document(text=result_text, extra_info=extra_info)
        return [result]

    def is_valid(self, dirs):
        """Return True when both the header and summary streams are listed.

        ``dirs`` is the directory listing of the OLE file: a list of entries,
        each entry being a list of path components.
        """
        return (
            [self.FILE_HEADER_SECTION] in dirs
            and [self.HWP_SUMMARY_SECTION] in dirs
        )

    def get_body_sections(self, dirs):
        """Return the ``BodyText/Section<N>`` stream names sorted by N.

        Entries under ``BodyText`` that are not named ``Section<digits>``
        are skipped instead of aborting the whole load.
        """
        section_numbers = []
        for entry in dirs:
            if len(entry) < 2 or entry[0] != self.BODYTEXT_SECTION:
                continue
            suffix = entry[1][self.SECTION_NAME_LENGTH:]
            if entry[1].startswith("Section") and suffix.isdigit():
                section_numbers.append(int(suffix))
        return ["BodyText/Section" + str(n) for n in sorted(section_numbers)]

    def _text_to_document(
        self, text: str, extra_info: Optional[Dict] = None
    ) -> Document:
        """Wrap ``text`` in a Document, defaulting ``extra_info`` to ``{}``."""
        return Document(text=text, extra_info=extra_info or {})

    def get_text(self):
        """Return the text of the most recently loaded file ("" if none)."""
        return self.text

    # Extract the full text.
    def _get_text(self, load_file, file_dir):
        """Concatenate the text of every body section, one per line block.

        The result is also cached on ``self.text``.
        """
        sections = self.get_body_sections(file_dir)
        self.text = "".join(
            self.get_text_from_section(load_file, section) + "\n"
            for section in sections
        )
        return self.text

    def is_compressed(self, load_file):
        """Return True when bit 0 of byte 36 of the file header is set."""
        header = load_file.openstream(self.FILE_HEADER_SECTION)
        header_data = header.read()
        return (header_data[36] & 1) == 1

    def get_text_from_section(self, load_file, section):
        """Extract the text records from a single body section stream.

        The stream is a sequence of records, each preceded by a 32-bit
        little-endian header: bits 0-9 hold the tag id and bits 20-31 the
        payload size. A size field of 0xFFF means the real size is stored
        in the following 32-bit word.

        Args:
            load_file: The opened OLE file.
            section: Stream name, e.g. ``"BodyText/Section0"``.

        Returns:
            str: Decoded text, with a newline after each text record.
        """
        data = load_file.openstream(section).read()
        # wbits=-15 selects a raw deflate stream (no zlib header/trailer).
        unpacked_data = (
            zlib.decompress(data, -15) if self.is_compressed(load_file) else data
        )

        size = len(unpacked_data)
        offset = 0
        parts = []
        # Require a full 4-byte header so trailing padding cannot crash
        # struct.unpack_from.
        while offset + 4 <= size:
            header = struct.unpack_from("<I", unpacked_data, offset)[0]
            offset += 4
            rec_type = header & 0x3FF
            rec_len = (header >> 20) & 0xFFF
            if rec_len == 0xFFF:
                # Extended size: actual length follows as a separate DWORD.
                if offset + 4 > size:
                    break
                rec_len = struct.unpack_from("<I", unpacked_data, offset)[0]
                offset += 4

            if rec_type in self.HWP_TEXT_TAGS:
                rec_data = unpacked_data[offset : offset + rec_len]
                parts.append(rec_data.decode("utf-16"))
                parts.append("\n")

            offset += rec_len

        return "".join(parts)
Source code in `llama-index-integrations/readers/llama-index-readers-hwp/llama_index/readers/hwp/base.py` (lines 23–45):
def load_data(
    self, file: Path, extra_info: Optional[Dict] = None
) -> List[Document]:
    """Load an Hwp file and extract its text.

    Args:
        file (Path): Path for the Hwp file.
        extra_info (Optional[Dict]): Metadata attached to the returned
            document.

    Returns:
        List[Document]: A single-element list holding the document.

    Raises:
        Exception: If ``is_valid`` rejects the file's directory listing.
    """
    import olefile

    load_file = olefile.OleFileIO(file)
    try:
        file_dir = load_file.listdir()
        if not self.is_valid(file_dir):
            raise Exception("Not Valid HwpFile")
        result_text = self._get_text(load_file, file_dir)
    finally:
        # Always release the underlying file handle, even on failure.
        load_file.close()

    result = self._text_to_document(text=result_text, extra_info=extra_info)
    return [result]