from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
#import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model
# Load the CLIP vision model and processor once at module import so all
# image-embedding helpers below share a single set of weights.
g_model_image = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
g_processor_image = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def get_image_embedding(image_path):
    """Return the CLIP image-feature embedding for an image file.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        numpy.ndarray with a leading batch dimension of 1
        (second dim is the CLIP projection size — presumably 512 for
        clip-vit-base-patch32; confirm against the model config).
    """
    global g_model_image, g_processor_image
    # Image.open is lazy and keeps the file handle open; the context
    # manager closes it as soon as the processor has consumed the pixels.
    with Image.open(image_path) as image:
        inputs = g_processor_image(images=image, return_tensors="pt")
    # Inference only: skip autograd bookkeeping, consistent with the
    # audio helpers in this module.
    with torch.no_grad():
        embeddings = g_model_image.get_image_features(**inputs)
    return embeddings.detach().numpy()
def get_image_data_embedding(image_data):
    """Return the CLIP image-feature embedding for an in-memory image.

    Args:
        image_data: Pixel array accepted by ``PIL.Image.fromarray``
            (e.g. an H x W x 3 uint8 numpy array — TODO confirm callers).

    Returns:
        numpy.ndarray with a leading batch dimension of 1.
    """
    global g_model_image, g_processor_image
    image = Image.fromarray(image_data)
    inputs = g_processor_image(images=image, return_tensors="pt")
    # Inference only: no_grad avoids building an autograd graph,
    # matching the audio helpers in this module.
    with torch.no_grad():
        embeddings = g_model_image.get_image_features(**inputs)
    return embeddings.detach().numpy()
# Load the Wav2Vec2 model and processor once at module import so the
# audio-embedding helpers below share them.
# NOTE: "aoudio" is a typo, kept because the name is referenced by the
# functions below (renaming would be a cross-cutting change).
g_processor_audio = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
g_model_aoudio = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
def get_audio_embedding(audio_path):
    """Return a Wav2Vec2 embedding for an audio file.

    Loads the waveform resampled to 16 kHz, runs the Wav2Vec2 encoder,
    and mean-pools the hidden states over the time axis.

    Args:
        audio_path: Path to an audio file readable by librosa.

    Returns:
        numpy.ndarray of shape (1, hidden_size).
    """
    global g_model_aoudio, g_processor_audio
    import librosa
    waveform, _ = librosa.load(audio_path, sr=16000)
    features = g_processor_audio(
        waveform, sampling_rate=16000, return_tensors="pt", padding=True
    )
    with torch.no_grad():
        hidden_states = g_model_aoudio(**features).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.detach().numpy()
def get_audio_data_embedding(audio_data, sample_rate=16000):
    """Return a Wav2Vec2 embedding for an in-memory waveform.

    Args:
        audio_data: 1-D waveform samples (assumed mono float array —
            TODO confirm against callers).
        sample_rate: Sample rate of ``audio_data``; the signal is
            resampled to 16 kHz when it differs.

    Returns:
        numpy.ndarray of shape (1, hidden_size): hidden states
        mean-pooled over the time axis.
    """
    global g_model_aoudio, g_processor_audio
    if sample_rate != 16000:
        # Lazy import: librosa is only required on the resample path,
        # so already-16 kHz callers don't need it installed.
        import librosa
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    inputs = g_processor_audio(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = g_model_aoudio(**inputs).last_hidden_state
    return embeddings.mean(dim=1).detach().numpy()
import pyautogui
import numpy as np

if __name__ == "__main__":
    # Grab a screenshot of the current screen.
    shot = pyautogui.screenshot()
    pixels = np.array(shot)
    # Embed the screenshot with CLIP and show a small sample of it.
    vector = get_image_data_embedding(pixels)
    print("Embedding shape:", vector.shape)
    print("Embedding sample (first 10 dims):", vector[0][:10])