Source code for lmitf.base_lvm

# %%
from __future__ import annotations

import base64
import io
import os
from typing import Any

from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image

load_dotenv()
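
# %%
# load_dotenv() pulls configuration from a local `.env` file, so the API
# credentials can live next to the project instead of in the shell
# environment. A minimal `.env` sketch (values are placeholders; the base
# URL shown is the public OpenAI default):
#
#   OPENAI_API_KEY=sk-...
#   OPENAI_BASE_URL=https://api.openai.com/v1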
# %%
class BaseLVM:
    """
    Wrapper around the OpenAI LVM (Language Vision Model) endpoints.

    Provides a simplified interface to the OpenAI image APIs for image
    generation and editing, reading credentials from environment variables
    when they are not passed explicitly.

    Attributes
    ----------
    client : openai.OpenAI
        OpenAI client instance used for all image requests.
    """

    def __init__(self, api_key: str | None = None, base_url: str | None = None):
        """
        Initialize the LVM client.

        Parameters
        ----------
        api_key : str, optional
            OpenAI API key. Falls back to the OPENAI_API_KEY environment
            variable when not provided.
        base_url : str, optional
            API base URL. Falls back to the OPENAI_BASE_URL environment
            variable when not provided.
        """
        self.client = OpenAI(
            api_key=api_key or os.getenv('OPENAI_API_KEY'),
            base_url=base_url or os.getenv('OPENAI_BASE_URL'),
        )

    def create(
        self,
        prompt: str,
        model: str = 'gpt-image-1',
        size: str = '1024x1024',
    ) -> Image.Image:
        """Generate an image from a text prompt and return it as a PIL Image."""
        response = self.client.images.generate(
            model=model,
            prompt=prompt,
            size=size,
        )
        b64_str = response.data[0].b64_json
        img_data = base64.b64decode(b64_str)
        image = Image.open(io.BytesIO(img_data))
        return image

    def edit(
        self,
        image: Image.Image,
        prompt: str,
        mask: Image.Image | None = None,
        model: str = 'gpt-image-1',
        size: str = '1024x1024',
    ) -> Image.Image:
        """
        Edit an existing image with a prompt and optional mask.

        The image and mask (if provided) are sent as file-like objects.
        Returns the first edited image as a PIL Image.
        """
        # Serialize the source image as an in-memory PNG
        img_buf = io.BytesIO()
        image.save(img_buf, format='PNG')
        img_buf.seek(0)

        # Serialize the mask the same way if one was provided
        files: dict[str, Any] = {'image': img_buf}
        if mask is not None:
            mask_buf = io.BytesIO()
            mask.save(mask_buf, format='PNG')
            mask_buf.seek(0)
            files['mask'] = mask_buf

        response = self.client.images.edit(
            model=model,
            prompt=prompt,
            size=size,
            **files,
        )
        edited_b64 = response.data[0].b64_json
        edited_img = Image.open(io.BytesIO(base64.b64decode(edited_b64)))
        return edited_img

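# %%
# Sketch: building a mask for BaseLVM.edit (illustrative helper, not part of
# the module). The Images API regenerates the fully transparent pixels of the
# mask, so marking a rectangle as editable means cutting an alpha hole out of
# a copy of the source image. `make_rect_mask` is a hypothetical name.
def make_rect_mask(image: Image.Image, box: tuple[int, int, int, int]) -> Image.Image:
    """Return a copy of `image` with `box` (left, upper, right, lower) made transparent."""
    mask = image.convert('RGBA')  # convert returns a new image, so the original is untouched
    hole = Image.new('RGBA', (box[2] - box[0], box[3] - box[1]), (0, 0, 0, 0))
    mask.paste(hole, box)
    return mask
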
# %%
class AgentLVM:
    """
    LVM client built on the OpenAI Responses API, using the
    `image_generation` tool for image creation and editing.
    """

    def __init__(self, api_key: str | None = None, base_url: str | None = None):
        """
        Initialize the Agent LVM client.

        Parameters
        ----------
        api_key : str, optional
            OpenAI API key. Falls back to the OPENAI_API_KEY environment
            variable when not provided.
        base_url : str, optional
            API base URL. Falls back to the OPENAI_BASE_URL environment
            variable when not provided.
        """
        self.client = OpenAI(
            api_key=api_key or os.getenv('OPENAI_API_KEY'),
            base_url=base_url or os.getenv('OPENAI_BASE_URL'),
        )

    def _encode_img(self, image: Image.Image) -> str:
        """Encode a PIL Image as a base64 PNG string."""
        img_buf = io.BytesIO()
        image.save(img_buf, format='PNG')
        img_buf.seek(0)
        return base64.b64encode(img_buf.read()).decode('utf-8')

    def _decode_img(self, img_b64: str) -> Image.Image:
        """Decode a base64 PNG string back into a PIL Image."""
        img_data = base64.b64decode(img_b64)
        return Image.open(io.BytesIO(img_data))

    def create(
        self,
        msg: list[dict],
        model: str = 'gpt-4o',
    ) -> Image.Image:
        """Generate an image from chat-style messages via the image_generation tool."""
        response = self.client.responses.create(
            model=model,
            input=msg,
            tools=[{'type': 'image_generation'}],
            tool_choice='required',
        )
        image_generation_calls = [
            output
            for output in response.output
            if output.type == 'image_generation_call'
        ]
        image_data = [output.result for output in image_generation_calls]
        if image_data:
            return self._decode_img(image_data[0])
        # response.output is a list, so report it directly; the original
        # `response.output.content` attribute does not exist
        raise ValueError(f'No image_generation_call in response output: {response.output}')

    def edit(
        self,
        prompt: str,
        image: Image.Image | list[Image.Image],
        model: str = 'gpt-4o',
    ) -> Image.Image:
        """
        Edit one or more existing images with a prompt.

        Images are sent inline as base64 data URLs.
        Returns the first edited image as a PIL Image.
        """
        if isinstance(image, Image.Image):
            image = [image]
        b64_images = [self._encode_img(img) for img in image]
        input_messages = [
            {
                'role': 'user',
                'content': [
                    {'type': 'input_text', 'text': prompt},
                ],
            },
        ]
        for img_b64 in b64_images:
            input_messages[0]['content'].append(
                # the Responses API expects inline images under `image_url`
                {'type': 'input_image', 'image_url': f'data:image/png;base64,{img_b64}'},
            )
        response = self.client.responses.create(
            model=model,
            input=input_messages,
            tools=[{'type': 'image_generation'}],
            tool_choice='required',
        )
        image_generation_calls = [
            output
            for output in response.output
            if output.type == 'image_generation_call'
        ]
        image_data = [output.result for output in image_generation_calls]
        if image_data:
            return self._decode_img(image_data[0])
        raise ValueError(f'No image_generation_call in response output: {response.output}')

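# %%
# Sketch: editing with multiple reference images (illustrative; the file
# names are hypothetical). Passing a list to AgentLVM.edit sends every image
# in the same user turn, so the prompt can refer to all of them at once:
#
#   a_lvm = AgentLVM()
#   character = Image.open('character.png')
#   scene = Image.open('scene.png')
#   result = a_lvm.edit(
#       image=[character, scene],
#       prompt='Place the character into the scene, matching its lighting.',
#   )
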
# %%
if __name__ == '__main__':
    vlm = BaseLVM()
    res = vlm.create('A beautiful landscape with mountains and a river')

    # %%
    vlm = BaseLVM()
    res = vlm.edit(res, 'Add a rainbow in the sky')

    # %%
    a_lvm = AgentLVM()
    res = a_lvm.create(
        msg=[
            {'role': 'system', 'content': 'You are a helpful assistant that generates images.'},
            {'role': 'user', 'content': 'Generate an image of a futuristic city.'},
        ],
    )

    # %%
    character_ref = Image.open(
        '/Users/zgh/Desktop/workingdir/AI-interface/lmitf/datasets/lvm_prompts/character_ref.png',
    )
    a_lvm = AgentLVM()
    res = a_lvm.edit(
        image=character_ref,
        prompt='Make the character look more futuristic with neon lights and a cyberpunk style.',
    )