import base64
import copy
import logging
import os
from mimetypes import guess_type

import docx
from docx import Document
class Word2Contents:
    """Convert Word (.docx) documents into an ordered list of content parts.

    Each part is a dict of one of two shapes:

    * ``{"type": "text", "text": <str>}``
    * ``{"type": "image", "base64": <str>, "mime_type": <str>}``

    Text is accumulated until an image is met; at that point the pending
    text is flushed as a text part, followed by the image part, so the
    resulting list keeps the document's reading order.
    """

    # Not used anywhere in this class; kept so existing callers that read
    # the attribute keep working.
    image_count = 0

    # Fallback MIME types for extensions ``mimetypes.guess_type`` may not know.
    _EXTRA_MIME_TYPES = {
        ".emf": "image/x-emf",
        ".wmf": "application/x-msmetafile",
    }

    def __init__(self):
        # Never populated by this class; returned as-is by get_image_data().
        self.image_data = []
        # Part names of images already emitted for the current document.
        self.partname_list = []

    def open(self, word_paths):
        """Parse one or more .docx files and return their content parts.

        Args:
            word_paths: A single path (``str`` or ``os.PathLike``) or an
                iterable of paths.

        Returns:
            A list of content-part dicts. For every document the text starts
            with a ``--- Content from <file name> ---`` header and the last
            text part ends with ``--- End of Content ---``.
        """
        if isinstance(word_paths, (str, os.PathLike)):
            word_paths = [word_paths]

        logger = logging.getLogger(__name__)
        results = []
        for word_path in word_paths:
            path = os.fspath(word_path)
            # Part names are only unique inside one package, so the
            # duplicate-image filter must start fresh for every document.
            self.partname_list = []
            doc = Document(path)
            header = f"\n\n--- Content from {os.path.basename(path)} ---\n\n"
            parts, trailing_text = self.__parse_docx_main(doc, header)
            results.extend(parts)
            results.append(
                self.__create_text_contents(
                    trailing_text + "\n--- End of Content ---\n\n"
                )
            )

        for part in results:
            if "text" in part:
                logger.debug("results %s", part)
        return results

    @staticmethod
    def __clean_text(text):
        """Round-trip *text* through UTF-8, replacing unencodable characters."""
        return text.encode("utf-8", errors="replace").decode("utf-8")

    def __create_text_contents(self, text):
        """Wrap *text* in a text content part."""
        return {"type": "text", "text": text}

    def __parse_docx_main(self, doc, first_text):
        """Walk the document body in order and collect content parts.

        Args:
            doc: The opened document.
            first_text: Text to place in front of the document's own text.

        Returns:
            ``(parts, pending_text)`` where ``parts`` are the parts flushed so
            far and ``pending_text`` is the text not yet emitted as a part.
        """
        results_contents = []
        text = first_text
        for element in doc.iter_inner_content():
            if isinstance(element, docx.text.paragraph.Paragraph):
                if self.__paragraph_contains_image(element):
                    # NOTE(review): the paragraph's own text is not added
                    # when it contains an image; confirm this is intended.
                    parts, text = self.__handle_image_in_paragraph(
                        doc, element, text
                    )
                    results_contents.extend(parts)
                else:
                    # Plain paragraph outside of any table.
                    text += self.__clean_text(element.text) + "\n"
            elif isinstance(element, docx.table.Table):
                # The helper receives the pending text and returns whatever is
                # still pending afterwards, so it replaces ``text`` outright;
                # appending would repeat text already flushed before an image.
                parts, text = self.__get_table_text(doc, element, text)
                results_contents.extend(parts)
        return results_contents, text

    def __paragraph_contains_image(self, paragraph):
        """Return True if any run of *paragraph* holds a DrawingML blip."""
        return any(run._element.xpath('.//a:blip') for run in paragraph.runs)

    def __handle_image_in_paragraph(self, doc, paragraph, now_text):
        """Emit the images of *paragraph*, flushing pending text first.

        Args:
            doc: The document owning the paragraph; used to resolve the
                relationship id of each blip to its image part.
            paragraph: A paragraph that may contain DrawingML images.
            now_text: Text accumulated so far and not yet emitted.

        Returns:
            ``(parts, pending_text)``. For each image not seen before in this
            document, ``parts`` gains a text part holding the pending text
            followed by an image part, and the pending text becomes ``""``.
            If every image was already emitted, ``parts`` is empty and
            ``now_text`` is returned unchanged.
        """
        embed_attr = (
            '{http://schemas.openxmlformats.org/officeDocument/2006/'
            'relationships}embed'
        )
        results_contents = []
        for run in paragraph.runs:
            for blip in run._element.xpath('.//a:blip'):
                image_part = doc.part.related_parts[blip.get(embed_attr)]
                partname = image_part.partname
                # Skip an image that was already emitted.
                if partname in self.partname_list:
                    continue
                self.partname_list.append(partname)

                # Flush the text that precedes the image.
                results_contents.append({"type": "text", "text": now_text})

                mime_type, _ = guess_type(partname)
                if mime_type is None:
                    # Cover the formats guess_type does not recognise.
                    ext = os.path.splitext(partname)[1].lower()
                    mime_type = self._EXTRA_MIME_TYPES.get(
                        ext, "application/octet-stream"
                    )

                encoded = base64.b64encode(image_part.blob).decode("utf-8")
                results_contents.append(
                    {"type": "image", "base64": encoded, "mime_type": mime_type}
                )
                now_text = ""
        return results_contents, now_text

    def __get_table_text(self, doc, element, now_text):
        """Render a table as ``|cell|`` text, emitting images found in cells.

        Each cell is written as ``|<content>|``, each row ends with a newline
        and the table ends with one extra newline. Tables nested inside a
        cell are rendered recursively in place.

        Args:
            doc: The document owning the table.
            element: The table to render; anything else is ignored.
            now_text: Text accumulated so far and not yet emitted.

        Returns:
            ``(parts, pending_text)`` with the same meaning as in
            ``__handle_image_in_paragraph``.
        """
        result_contents = []
        cell_text = now_text
        if not isinstance(element, docx.table.Table):
            return result_contents, cell_text

        for row in element.rows:
            for cell in row.cells:
                cell_text += "|"
                # iter_inner_content yields paragraphs and nested tables in
                # document order; cell.paragraphs alone never yields tables.
                for item in cell.iter_inner_content():
                    if isinstance(item, docx.text.paragraph.Paragraph):
                        if self.__paragraph_contains_image(item):
                            parts, cell_text = self.__handle_image_in_paragraph(
                                doc, item, cell_text
                            )
                            result_contents.extend(parts)
                        else:
                            cell_text += self.__clean_text(item.text)
                    elif isinstance(item, docx.table.Table):
                        parts, cell_text = self.__get_table_text(
                            doc, item, cell_text
                        )
                        result_contents.extend(parts)
                cell_text += "|"
            cell_text += "\n"
        cell_text += "\n"
        return result_contents, cell_text

    def get_image_data(self):
        """Return the ``image_data`` list (nothing in this class fills it)."""
        return self.image_data