# %%
from __future__ import annotations
import base64
import io
import os
from typing import Any

from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image

load_dotenv()
# %%
class BaseLVM:
    """
    Wrapper around the OpenAI LVM (Language Vision Model) image APIs.

    Provides a simplified interface to OpenAI image generation and editing,
    with configuration read automatically from environment variables.

    Attributes
    ----------
    client : openai.OpenAI
        The underlying OpenAI client instance.
    """
    def __init__(self, api_key: str | None = None, base_url: str | None = None):
        """
        Initialize the LVM client.

        Parameters
        ----------
        api_key : str, optional
            OpenAI API key. If not provided, read from the OPENAI_API_KEY
            environment variable.
        base_url : str, optional
            API base URL. If not provided, read from the OPENAI_BASE_URL
            environment variable.
        """
        self.client = OpenAI(
            api_key=api_key or os.getenv('OPENAI_API_KEY'),
            base_url=base_url or os.getenv('OPENAI_BASE_URL'),
        )
    def create(
        self,
        prompt: str,
        model: str = 'gpt-image-1',
        size: str = '1024x1024',
    ) -> Image.Image:
        """
        Generate an image from a text prompt.

        Returns the first generated image as a PIL Image.
        """
        response = self.client.images.generate(
            model=model,
            prompt=prompt,
            size=size,
        )
        b64_str = response.data[0].b64_json
        img_data = base64.b64decode(b64_str)
        image = Image.open(io.BytesIO(img_data))
        return image
def edit(
self,
image: Image.Image,
prompt: str,
mask: Image.Image | None = None,
model: str = 'gpt-image-1',
size: str = '1024x1024',
) -> Image.Image:
"""
Edit an existing image with a prompt and optional mask.
The image and mask (if provided) are sent as file-like objects.
Returns the first edited image as a PIL Image.
"""
        # Prepare the image as an in-memory PNG file
        img_buf = io.BytesIO()
        image.save(img_buf, format='PNG')
        img_buf.seek(0)
        # Prepare the mask file if one is provided
        files: dict[str, Any] = {'image': img_buf}
        if mask is not None:
            mask_buf = io.BytesIO()
            mask.save(mask_buf, format='PNG')
            mask_buf.seek(0)
            files['mask'] = mask_buf
response = self.client.images.edit(
model=model,
prompt=prompt,
size=size,
**files,
)
edited_b64 = response.data[0].b64_json
edited_img = Image.open(io.BytesIO(base64.b64decode(edited_b64)))
return edited_img
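
# A minimal sketch of constructing a mask for ``BaseLVM.edit``: the OpenAI
# image edit endpoint treats fully transparent pixels (alpha == 0) as the
# region to regenerate. ``make_rect_mask`` and its box layout are
# illustrative assumptions, not part of the original module.
def make_rect_mask(size: tuple[int, int], box: tuple[int, int, int, int]) -> Image.Image:
    """Return an RGBA mask that is opaque except inside ``box``."""
    mask = Image.new('RGBA', size, (0, 0, 0, 255))  # fully opaque base
    hole = Image.new('RGBA', (box[2] - box[0], box[3] - box[1]), (0, 0, 0, 0))
    mask.paste(hole, (box[0], box[1]))  # transparent editable region
    return mask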
class AgentLVM:
    """
    Agent-style LVM client built on the OpenAI Responses API.

    Uses the built-in ``image_generation`` tool to create and edit images
    within a conversational message flow.
    """
    def __init__(self, api_key: str | None = None, base_url: str | None = None):
        """
        Initialize the agent LVM client.

        Parameters
        ----------
        api_key : str, optional
            OpenAI API key. If not provided, read from the OPENAI_API_KEY
            environment variable.
        base_url : str, optional
            API base URL. If not provided, read from the OPENAI_BASE_URL
            environment variable.
        """
        self.client = OpenAI(
            api_key=api_key or os.getenv('OPENAI_API_KEY'),
            base_url=base_url or os.getenv('OPENAI_BASE_URL'),
        )
    def _encode_img(self, image: Image.Image) -> str:
        """Encode a PIL Image as a base64-encoded PNG string."""
        img_buf = io.BytesIO()
        image.save(img_buf, format='PNG')
        img_buf.seek(0)
        return base64.b64encode(img_buf.read()).decode('utf-8')

    def _decode_img(self, img_b64: str) -> Image.Image:
        """Decode a base64-encoded string back into a PIL Image."""
        img_data = base64.b64decode(img_b64)
        return Image.open(io.BytesIO(img_data))
    def create(
        self,
        msg: list[dict],
        model: str = 'gpt-4o',
    ) -> Image.Image:
        """
        Generate an image from a list of chat-style messages.

        Calls the Responses API with the built-in ``image_generation``
        tool and returns the first generated image as a PIL Image.
        """
        response = self.client.responses.create(
            model=model,
            input=msg,
            tools=[{'type': 'image_generation'}],
            tool_choice='required',
        )
        image_generation_calls = [
            output
            for output in response.output
            if output.type == 'image_generation_call'
        ]
        image_data = [output.result for output in image_generation_calls]
        if image_data:
            return self._decode_img(image_data[0])
        else:
            # ``response.output`` is a list and has no ``.content`` attribute;
            # surface the raw output in the error instead.
            raise ValueError(f'No image was generated: {response.output}')
def edit(
self,
prompt: str,
image: Image.Image | list[Image.Image],
model: str = 'gpt-4o',
) -> Image.Image:
"""
Edit an existing image with a prompt.
The image is sent as a file-like object.
Returns the first edited image as a PIL Image.
"""
if isinstance(image, Image.Image):
image = [image]
b64_images = [self._encode_img(img) for img in image]
input = [
{
'role': 'user',
'content': [
{'type': 'input_text', 'text': prompt},
],
},
]
for img_b64 in b64_images:
input[0]['content'].append(
{'type': 'input_image', 'data': f"data:image/png;base64,{img_b64}"},
)
response = self.client.responses.create(
model=model,
input=input,
tools=[{'type': 'image_generation'}],
tool_choice='required',
)
image_generation_calls = [
output
for output in response.output
if output.type == 'image_generation_call'
]
image_data = [output.result for output in image_generation_calls]
if image_data:
image_base64 = image_data[0]
img_data = base64.b64decode(image_base64)
image = Image.open(io.BytesIO(img_data))
return image
else:
raise ValueError(response.output.content)
# %%
if __name__ == '__main__':
vlm = BaseLVM()
res = vlm.create('A beautiful landscape with mountains and a river')
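# %%
# Persist the generated image for later inspection; the filename is an
# illustrative choice, not part of the original workflow.
res.save('landscape.png')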
# %%
vlm = BaseLVM()
res = vlm.edit(res, 'Add a rainbow in the sky')
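# %%
# Hedged example: constrain the edit to the top half of the image using the
# ``make_rect_mask`` helper sketched above (the box choice is illustrative).
mask = make_rect_mask(res.size, (0, 0, res.size[0], res.size[1] // 2))
res = vlm.edit(res, 'Add a rainbow in the sky', mask=mask)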
# %%
a_lvm = AgentLVM()
res = a_lvm.create(
msg=[
{'role':'system','content':'You are a helpful assistant that generates images.'},
{'role':'user','content':'Generate an image of a futuristic city.'},
],
)
# %%
character_ref = Image.open('/Users/zgh/Desktop/workingdir/AI-interface/lmitf/datasets/lvm_prompts/character_ref.png')
a_lvm = AgentLVM()
res = a_lvm.edit(
image=character_ref,
prompt='Make the character look more futuristic with neon lights and a cyberpunk style.',
)
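# %%
# ``AgentLVM.edit`` also accepts a list of reference images; the same
# reference is passed twice here purely to illustrate the list form.
res = a_lvm.edit(
    image=[character_ref, character_ref],
    prompt='Combine both references into a single cyberpunk portrait.',
)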
# %%