import base64
import requests
import json
from typing import List
from pydantic import BaseModel, conlist
# Define the response model using Pydantic
class ClipTaggerResponse(BaseModel):
    """Schema for the JSON annotation returned for a single video keyframe.

    The fields mirror the keys that the user prompt in this file asks the
    model to emit. Constructing an instance from the decoded JSON validates
    it: a missing key, a wrong type, or a list longer than its limit raises
    ``pydantic.ValidationError``.

    Attributes:
        description: Factual account of what is visibly happening.
        objects: Visible objects with relevant details; at most 10 items.
        actions: Visible actions with participants; at most 5 items.
        environment: Description of the setting based on visible cues.
        content_type: Kind of content (e.g. real-world footage, animation).
        specific_style: Genre, aesthetic, or platform style.
        production_quality: Visible production level.
        summary: One-sentence summary of the frame.
        logos: Logos visible in the frame; an empty list when there are none.
    """

    description: str
    # List sizes match the "Maximum 10 objects and 5 actions" rule in the prompt.
    objects: conlist(str, max_length=10)
    actions: conlist(str, max_length=5)
    environment: str
    content_type: str
    specific_style: str
    production_quality: str
    summary: str
    logos: List[str]
# System and user prompts (use exactly as shown for best results).
# SYSTEM_PROMPT sets the annotator role and forbids lead-ins such as
# "This image shows". USER_PROMPT spells out the JSON structure to return,
# key by key, followed by the output rules (literal descriptions only,
# at most 10 objects and 5 actions, empty 'logos' array when none, JSON only).
# NOTE(review): the "e.e." in the 'specific_style' line looks like a typo for
# "e.g."; it is left untouched because the prompts are meant to be sent
# verbatim — confirm against the upstream prompt before changing it.
SYSTEM_PROMPT = "You are an image annotation API trained to analyze YouTube video keyframes. You will be given instructions on the output format, what to caption, and how to perform your job. Follow those instructions. For descriptions and summaries, provide them directly and do not lead them with 'This image shows' or 'This keyframe displays...', just get right into the details."
USER_PROMPT = """
You are an image annotation API trained to analyze YouTube video keyframes. You must respond with a valid JSON object matching the exact structure below.
Your job is to extract detailed **factual elements directly visible** in the image. Do not speculate or interpret artistic intent, camera focus, or composition. Do not include phrases like "this appears to be", "this looks like", or anything about the image itself. Describe what **is physically present in the frame**, and nothing more.
Return JSON in this structure:
{
"description": "A detailed, factual account of what is visibly happening (4 sentences max). Only mention concrete elements or actions that are clearly shown. Do not include anything about how the image is styled, shot, or composed. Do not lead the description with something like 'This image shows' or 'this keyframe is...', just get right into the details.",
"objects": ["object1 with relevant visual details", "object2 with relevant visual details", ...],
"actions": ["action1 with participants and context", "action2 with participants and context", ...],
"environment": "Detailed factual description of the setting and atmosphere based on visible cues (e.g., interior of a classroom with fluorescent lighting, or outdoor forest path with snow-covered trees).",
"content_type": "The type of content it is, e.g. 'real-world footage', 'video game', 'animation', 'cartoon', 'CGI', 'VTuber', etc.",
"specific_style": "Specific genre, aesthetic, or platform style (e.e., anime, 3D animation, mobile gameplay, vlog, tutorial, news broadcast, etc.)",
"production_quality": "Visible production level: e.g., 'professional studio', 'amateur handheld', 'webcam recording', 'TV broadcast', etc.",
"summary": "One clear, comprehensive sentence summarizing the visual content of the frame. Like the description, get right to the point.",
"logos": ["logo1 with visual description", "logo2 with visual description", ...]
}
Rules:
- Be specific and literal. Focus on what is explicitly visible.
- Do NOT include interpretations of emotion, mood, or narrative unless it's visually explicit.
- No artistic or cinematic analysis.
- Always include the language of any text in the image if present as an object, e.g. "English text", "Japanese text", "Russian text", etc.
- Maximum 10 objects and 5 actions.
- Return an empty array for 'logos' if none are present.
- Always output strictly valid JSON with proper escaping.
- Output **only the JSON**, no extra text or explanation.
"""
# Function to encode image from URL
def encode_image_url(image_url, timeout=30):
    """Download an image and return its bytes as a base64 string.

    Args:
        image_url: URL fetched with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body, base64-encoded and decoded to a ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail loudly on error statuses; otherwise the body of an error page
    # would be encoded and passed along as if it were the image.
    response.raise_for_status()
    return base64.b64encode(response.content).decode("utf-8")
# Example usage: download one image, send it with the prompts to the model,
# then validate and print the structured annotation.
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Fetch the image over HTTP and base64-encode it so it can be inlined below.
base64_image = encode_image_url(image_url)
# One system message plus one user message carrying two parts:
# the text instructions and the image itself.
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{
"role": "user",
"content": [
{"type": "text", "text": USER_PROMPT},
{
"type": "image_url",
"image_url": {
# The image is embedded as a data URL; the MIME type is hard-coded to
# image/jpeg regardless of what the download actually returned.
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
},
},
],
},
]
# NOTE(review): `client` is not defined or imported anywhere in the visible
# source, so this line raises NameError as written. The call shape suggests an
# OpenAI-compatible chat-completions client — confirm and construct it first.
response = client.chat.completions.create(
model="inference-net/cliptagger-12b",
messages=messages,
temperature=0.1,
max_tokens=2000,
response_format={"type": "json_object"},
)
# Parse and validate the JSON response.
# json.loads raises json.JSONDecodeError if the message content is not valid JSON.
raw_result = json.loads(response.choices[0].message.content)
result = ClipTaggerResponse(**raw_result) # This will raise ValidationError if the response doesn't match the schema
# Now 'result' is a typed Pydantic model instance
print(result.model_dump_json(indent=2))
# You can access typed properties
print(f"Description: {result.description}")
print(f"Objects found: {len(result.objects)}")