AI Agent:tools:word2contents.py:ソースコード

import os
import docx
from docx import Document
import base64
from mimetypes import guess_type
import copy
class Word2Contents:
    """Convert Word (.docx) documents into an ordered list of content parts.

    Each part is a dict of one of two shapes:

    * ``{"type": "text", "text": <str>}``
    * ``{"type": "image", "base64": <str>, "mime_type": <str>}``

    Text is accumulated in a pending buffer while the document body is
    walked in order. Whenever an embedded image is met, the pending text is
    flushed as a text part, the image part follows it, and the buffer is
    restarted empty. This keeps text and images in document order.
    """

    # Not referenced by any method of this class; kept so existing callers
    # that read the attribute keep working.
    image_count = 0

    def __init__(self):
        # Never populated by this class; only exposed via get_image_data().
        self.image_data = []
        # Part names of images already emitted for the current document,
        # used to skip repeated occurrences of the same image.
        self.partname_list = []

    def open(self, word_paths):
        """Parse one or more .docx files and return their content parts.

        Args:
            word_paths: A single path (str) or an iterable of paths.

        Returns:
            A list of content-part dicts. For every document the text starts
            with a ``--- Content from <basename> ---`` banner and the final
            text part ends with ``--- End of Content ---``.
        """
        if isinstance(word_paths, str):
            word_paths = [word_paths]

        results = []
        for word_path in word_paths:
            doc = Document(word_path)

            # Image part names (e.g. /word/media/image1.png) repeat across
            # documents, so duplicate tracking must restart for each file.
            self.partname_list = []

            banner = f"\n\n--- Content from {os.path.basename(word_path)} ---\n\n"
            contents, trailing_text = self.__parse_docx_main(doc, banner)

            results.extend(contents)
            results.append(
                self.__create_text_contents(trailing_text + "\n--- End of Content ---\n\n")
            )
        return results

    def __create_text_contents(self, text):
        """Wrap ``text`` in a text content-part dict."""
        return {"type": "text", "text": text}

    @staticmethod
    def __clean_text(text):
        """Return ``text`` with characters that cannot be UTF-8 encoded replaced."""
        return text.encode("utf-8", errors="replace").decode("utf-8")

    def __parse_docx_main(self, doc, first_text):
        """Walk the document body in order, collecting content parts.

        Args:
            doc: The opened python-docx document.
            first_text: Text that seeds the pending text buffer.

        Returns:
            ``(contents, pending_text)`` where ``contents`` is the list of
            parts flushed so far and ``pending_text`` is the text gathered
            after the last flushed image (or everything, if no image was
            emitted).
        """
        results_contents = []
        text = first_text

        for element in doc.iter_inner_content():
            if isinstance(element, docx.text.paragraph.Paragraph):
                if self.__paragraph_contains_image(element):
                    # NOTE: the paragraph's own text is not added here; only
                    # its images are emitted.
                    contents, text = self.__handle_image_in_paragraph(doc, element, text)
                    results_contents.extend(contents)
                else:
                    # Plain paragraph outside of any table.
                    text += self.__clean_text(element.text) + "\n"
            elif isinstance(element, docx.table.Table):
                # The helper returns the complete pending buffer (it starts
                # from the text passed in), so assign rather than append;
                # appending would repeat text already flushed before an image.
                contents, text = self.__get_table_text(doc, element, text)
                results_contents.extend(contents)

        return results_contents, text

    def __paragraph_contains_image(self, paragraph):
        """Return True if any run of ``paragraph`` holds a DrawingML picture."""
        return any(run._element.xpath('.//a:blip') for run in paragraph.runs)

    def __handle_image_in_paragraph(self, doc, paragraph, now_text):
        """Emit content parts for the embedded images of ``paragraph``.

        For each image not seen before in the current document, the pending
        text is flushed as a text part, followed by the image part, and the
        pending text is reset to an empty string.

        Args:
            doc: The document owning the paragraph; its main part resolves
                relationship ids to image parts.
            paragraph: Paragraph whose runs are scanned for ``a:blip`` nodes.
            now_text: Pending text accumulated before this paragraph.

        Returns:
            ``(contents, pending_text)``. If no new image was emitted,
            ``contents`` is empty and ``pending_text`` equals ``now_text``.
        """
        embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
        # Extensions mimetypes.guess_type may not know about.
        fallback_mime_types = {
            ".emf": "image/x-emf",
            ".wmf": "application/x-msmetafile",
        }
        results_contents = []

        for run in paragraph.runs:
            for blip in run._element.xpath('.//a:blip'):
                embed_id = blip.get(embed_attr)
                try:
                    image_part = doc.part.related_parts[embed_id]
                except KeyError:
                    # Linked (non-embedded) pictures carry no r:embed id, and
                    # a dangling id has no part to read; nothing to emit.
                    continue

                partname = image_part.partname
                # Skip an image that was already emitted for this document.
                if partname in self.partname_list:
                    continue
                self.partname_list.append(partname)

                # Flush the text gathered so far so it precedes the image.
                results_contents.append({"type": "text", "text": now_text})

                mime_type, _ = guess_type(partname)
                if mime_type is None:
                    ext = os.path.splitext(partname)[1].lower()
                    mime_type = fallback_mime_types.get(ext, "application/octet-stream")

                b64 = base64.b64encode(image_part.blob).decode("utf-8")
                results_contents.append(
                    {"type": "image", "base64": b64, "mime_type": mime_type}
                )

                # Everything up to this image has been emitted.
                now_text = ""

        return results_contents, now_text

    def __get_table_text(self, doc, element, now_text):
        """Render a table as pipe-delimited text, emitting any cell images.

        Every cell is written as ``|<cell text>|``, every row ends with a
        newline and the table is followed by one extra newline. Nested
        tables are rendered recursively in place.

        Args:
            doc: The document owning the table.
            element: The table to render. Any other object is ignored.
            now_text: Pending text accumulated before the table.

        Returns:
            ``(contents, pending_text)`` where ``pending_text`` starts from
            ``now_text`` and is restarted after each emitted image.
        """
        result_contents = []
        cell_text = now_text

        if not isinstance(element, docx.table.Table):
            return result_contents, cell_text

        for row in element.rows:
            for cell in row.cells:
                cell_text += "|"
                # iter_inner_content() yields paragraphs AND nested tables in
                # order; cell.paragraphs would silently drop nested tables.
                for item in cell.iter_inner_content():
                    if isinstance(item, docx.text.paragraph.Paragraph):
                        if self.__paragraph_contains_image(item):
                            contents, cell_text = self.__handle_image_in_paragraph(
                                doc, item, cell_text
                            )
                            result_contents.extend(contents)
                        else:
                            cell_text += self.__clean_text(item.text)
                    elif isinstance(item, docx.table.Table):
                        # The recursive call returns the full pending buffer,
                        # so assign instead of appending to avoid duplicates.
                        contents, cell_text = self.__get_table_text(doc, item, cell_text)
                        result_contents.extend(contents)
                cell_text += "|"
            cell_text += "\n"
        cell_text += "\n"

        return result_contents, cell_text

    def get_image_data(self):
        """Return the ``image_data`` list (never filled by this class)."""
        return self.image_data