Multimodal AI models that understand both text and images have opened new possibilities for application development. From document understanding to visual question answering, these models enable powerful features. This guide covers practical implementation patterns for multimodal AI.
Image Understanding with Claude
// Image analysis with Claude Vision
import Anthropic from '@anthropic-ai/sdk';
import * as fs from 'fs';
// Shared client; the SDK reads ANTHROPIC_API_KEY from the environment by default.
const client = new Anthropic();
/**
 * Analyze a local image file with Claude Vision and return the model's text reply.
 *
 * @param imagePath - Path to the image file (.png/.gif/.webp detected; anything else is sent as JPEG)
 * @param prompt - Instruction describing what to extract or describe
 * @returns The first text block of the model's response
 * @throws If the file cannot be read or the response contains no text block
 */
async function analyzeImage(
  imagePath: string,
  prompt: string
): Promise<string> {
  const imageData = fs.readFileSync(imagePath);
  const base64 = imageData.toString('base64');
  // Detect media type case-insensitively (the original `.endsWith('.png')`
  // missed uppercase extensions and only distinguished PNG from JPEG).
  const lower = imagePath.toLowerCase();
  const mediaType = lower.endsWith('.png')
    ? ('image/png' as const)
    : lower.endsWith('.gif')
      ? ('image/gif' as const)
      : lower.endsWith('.webp')
        ? ('image/webp' as const)
        : ('image/jpeg' as const);
  const response = await client.messages.create({
    model: 'claude-sonnet-4-20250514',
    max_tokens: 4096,
    messages: [
      {
        role: 'user',
        content: [
          {
            type: 'image',
            source: {
              type: 'base64',
              media_type: mediaType,
              data: base64,
            },
          },
          {
            type: 'text',
            text: prompt,
          },
        ],
      },
    ],
  });
  // response.content is a union of block types (text, tool_use, ...);
  // find the first text block instead of assuming content[0] is text.
  const textBlock = response.content.find((block) => block.type === 'text');
  if (!textBlock || textBlock.type !== 'text') {
    throw new Error('Model response contained no text block');
  }
  return textBlock.text;
}
// Use cases (top-level await — requires an ES-module context)
// 1. Product description generation
const productDescription = await analyzeImage(
'./product.jpg',
`Analyze this product image and generate:
1. A compelling product title
2. Key features (bullet points)
3. A marketing description (2-3 sentences)
4. Suggested categories for e-commerce`
);
// 2. Accessibility: Image alt text
const altText = await analyzeImage(
'./hero-image.jpg',
`Generate concise, descriptive alt text for this image.
Focus on the main subject and important visual elements.
Keep it under 125 characters.`
);
// 3. Content moderation
// NOTE(review): the model returns a JSON *string* here — parse and validate
// it before trusting `safe`/`issues`/`confidence` downstream.
const moderationResult = await analyzeImage(
'./user-upload.jpg',
`Analyze this image for content moderation.
Check for: violence, adult content, hate symbols, spam/scam indicators.
Return JSON: {"safe": boolean, "issues": string[], "confidence": number}`
);

Document Processing
// Document understanding and extraction
// Invoice processing
/**
 * Extract structured data from an invoice image.
 *
 * @param invoicePath - Path to the invoice image
 * @returns Parsed invoice object (vendor, line items, totals)
 * @throws SyntaxError if the model's reply is not valid JSON after fence removal
 */
async function processInvoice(invoicePath: string) {
  const result = await analyzeImage(
    invoicePath,
    `Extract structured data from this invoice. Return JSON:
{
"vendor": {
"name": string,
"address": string,
"tax_id": string | null
},
"invoice_number": string,
"date": string (ISO format),
"due_date": string | null,
"line_items": [
{
"description": string,
"quantity": number,
"unit_price": number,
"total": number
}
],
"subtotal": number,
"tax": number,
"total": number,
"currency": string
}`
  );
  // Models often wrap JSON in markdown code fences; strip them so
  // JSON.parse does not throw on otherwise-valid extractions.
  const jsonText = result
    .replace(/^\s*```(?:json)?\s*/i, '')
    .replace(/\s*```\s*$/, '')
    .trim();
  return JSON.parse(jsonText);
}
// Multi-page document processing
/**
 * Transcribe every page of a multi-page document and produce a summary.
 *
 * @param pagePaths - Ordered paths of the page images
 * @returns Per-page markdown transcriptions plus an overall summary
 */
async function processDocument(pagePaths: string[]) {
  // Pages are independent, so extract them in parallel; Promise.all
  // preserves input order. If you hit API rate limits, swap this for a
  // small concurrency pool instead of going back to a sequential loop.
  const pageContents = await Promise.all(
    pagePaths.map((pagePath) =>
      analyzeImage(
        pagePath,
        `Extract all text content from this document page.
Preserve structure: headings, paragraphs, lists, tables.
Use markdown formatting.`
      )
    )
  );
  // Combine and analyze full document
  const fullDocument = pageContents.join('\n\n---\n\n');
  const summary = await client.messages.create({
    model: 'claude-sonnet-4-20250514',
    max_tokens: 2048,
    messages: [
      {
        role: 'user',
        content: `Summarize this document:\n\n${fullDocument}`,
      },
    ],
  });
  // Narrow to the first text block rather than assuming content[0] is text.
  const summaryBlock = summary.content.find((block) => block.type === 'text');
  return {
    pages: pageContents,
    summary:
      summaryBlock && summaryBlock.type === 'text' ? summaryBlock.text : '',
  };
}
// Form data extraction
async function extractFormData(formImagePath: string, formSchema: object) {
const result = await analyzeImage(
formImagePath,
`Extract data from this form into the following schema:
${JSON.stringify(formSchema, null, 2)}
For checkbox fields, use true/false.
For fields that are empty or unclear, use null.
Return valid JSON matching the schema.`
);
return JSON.parse(result);
}

Comparing Multiple Images
// Multi-image analysis
/**
 * Send multiple images in a single message and ask the model to compare them.
 *
 * @param imagePaths - Paths of the images, in the order they should be referenced
 * @param comparisonPrompt - Instruction describing the comparison to perform
 * @returns The model's text response
 * @throws If a file cannot be read or the response contains no text block
 */
async function compareImages(
  imagePaths: string[],
  comparisonPrompt: string
): Promise<string> {
  const imageContents = imagePaths.map((imagePath) => {
    const data = fs.readFileSync(imagePath).toString('base64');
    // Detect the media type per image — the original hardcoded JPEG,
    // which mislabels PNG/GIF/WebP inputs.
    const lower = imagePath.toLowerCase();
    const mediaType = lower.endsWith('.png')
      ? ('image/png' as const)
      : lower.endsWith('.gif')
        ? ('image/gif' as const)
        : lower.endsWith('.webp')
          ? ('image/webp' as const)
          : ('image/jpeg' as const);
    return {
      type: 'image' as const,
      source: {
        type: 'base64' as const,
        media_type: mediaType,
        data,
      },
    };
  });
  const response = await client.messages.create({
    model: 'claude-sonnet-4-20250514',
    max_tokens: 4096,
    messages: [
      {
        role: 'user',
        content: [
          ...imageContents,
          { type: 'text', text: comparisonPrompt },
        ],
      },
    ],
  });
  // Narrow to the first text block rather than assuming content[0] is text.
  const textBlock = response.content.find((block) => block.type === 'text');
  if (!textBlock || textBlock.type !== 'text') {
    throw new Error('Model response contained no text block');
  }
  return textBlock.text;
}
// Use cases (top-level await — requires an ES-module context)
// Product comparison
const comparison = await compareImages(
['./product-a.jpg', './product-b.jpg'],
`Compare these two products:
1. Visual differences (design, color, size)
2. Feature differences visible in images
3. Which appears more premium/higher quality?
4. Recommendation for different use cases`
);
// Before/after analysis
const progression = await compareImages(
['./before.jpg', './after.jpg'],
`Analyze the before and after images:
1. Key changes visible
2. Improvement areas
3. Remaining issues (if any)
4. Overall assessment`
);
// Visual similarity for recommendations
// NOTE: image order matters — the prompt refers to "the first image".
const similar = await compareImages(
['./user-style.jpg', './product-1.jpg', './product-2.jpg', './product-3.jpg'],
`The first image shows the user's preferred style.
Rank products 1-3 by how well they match this style.
Explain the reasoning for each ranking.`
);

Building a Visual Search System
// Visual search with embeddings
import { Pinecone } from '@pinecone-database/pinecone';
// The Pinecone client reads PINECONE_API_KEY from the environment by default.
const pinecone = new Pinecone();
// The 'visual-search' index must already exist and its dimensionality must
// match the embedding model used below.
const index = pinecone.index('visual-search');
// Generate image embeddings using CLIP or similar
/**
 * Generate a vector embedding for an image via an external embedding API.
 *
 * @param imagePath - Path to the image file to embed
 * @returns The embedding vector
 * @throws If the HTTP request fails or returns a non-2xx status
 */
async function getImageEmbedding(imagePath: string): Promise<number[]> {
  // Use a vision embedding model (CLIP, OpenCLIP, etc.)
  const response = await fetch('https://api.embeddings.example/v1/embed', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      image: fs.readFileSync(imagePath).toString('base64'),
      model: 'clip-vit-large',
    }),
  });
  // fetch does not reject on HTTP error statuses; check explicitly so a
  // failed call surfaces here instead of returning undefined downstream.
  if (!response.ok) {
    throw new Error(
      `Embedding API error: ${response.status} ${response.statusText}`
    );
  }
  const data = (await response.json()) as { embedding: number[] };
  return data.embedding;
}
// Index product images
/**
 * Embed each product image and upsert the vectors into Pinecone.
 *
 * Embedding calls run in parallel (they are independent), and upserts are
 * batched to keep individual requests within Pinecone's size limits.
 *
 * @param products - Products with local image paths and display metadata
 */
async function indexProducts(products: Product[]) {
  // Promise.all preserves input order, so vector ids line up with products.
  const vectors = await Promise.all(
    products.map(async (product) => ({
      id: product.id,
      values: await getImageEmbedding(product.imagePath),
      metadata: {
        name: product.name,
        category: product.category,
        price: product.price,
        imageUrl: product.imageUrl,
      },
    }))
  );
  // Upsert in batches instead of one unbounded request.
  const BATCH_SIZE = 100;
  for (let i = 0; i < vectors.length; i += BATCH_SIZE) {
    await index.upsert(vectors.slice(i, i + BATCH_SIZE));
  }
}
// Search by image
/**
 * Find the top-K visually similar products for a query image.
 *
 * @param imagePath - Path of the query image
 * @param filters - Optional Pinecone metadata filter
 * @param topK - Number of matches to return (default 10)
 * @returns Matches with their similarity scores and product metadata
 */
async function searchByImage(
  imagePath: string,
  filters?: object,
  topK: number = 10
): Promise<SearchResult[]> {
  const queryEmbedding = await getImageEmbedding(imagePath);
  const queryResponse = await index.query({
    vector: queryEmbedding,
    topK,
    filter: filters,
    includeMetadata: true,
  });
  const hits: SearchResult[] = [];
  for (const match of queryResponse.matches) {
    hits.push({
      id: match.id,
      score: match.score,
      product: match.metadata,
    });
  }
  return hits;
}
// Combine visual search with text understanding
async function intelligentSearch(
imagePath: string,
textQuery?: string
): Promise<SearchResult[]> {
// Get visual description
const description = await analyzeImage(
imagePath,
'Describe this item in detail: style, color, material, category'
);
// Get initial visual matches
const visualMatches = await searchByImage(imagePath, {}, 20);
// Refine with text if provided
if (textQuery) {
// Use LLM to filter/rank based on text query
const refinedResults = await client.messages.create({
model: 'claude-sonnet-4-20250514',
max_tokens: 2048,
messages: [
{
role: 'user',
content: `User searched with an image described as: ${description}
They also specified: "${textQuery}"
Here are the visual search results:
${JSON.stringify(visualMatches, null, 2)}
Rerank these results based on how well they match both the visual similarity and the text query.
Return the reranked IDs as a JSON array.`,
},
],
});
const rankedIds = JSON.parse(refinedResults.content[0].text);
return rankedIds.map((id: string) =>
visualMatches.find((m) => m.id === id)
);
}
return visualMatches;
}

Best Practices
Multimodal AI Best Practices
Image Quality:
- Ensure sufficient resolution (min 512px)
- Good lighting and contrast
- Compress appropriately for API limits
Prompting:
- Be specific about expected output format
- Use JSON schemas for structured extraction
- Provide examples for complex tasks
Performance:
- Cache embeddings for static images
- Batch process when possible
- Use appropriate model sizes
Safety:
- Implement content moderation
- Handle sensitive content appropriately
- Validate extracted data
Conclusion
Multimodal AI enables powerful new capabilities from document processing to visual search. The key is understanding when to use vision models versus traditional CV techniques, and how to combine them effectively.
Need help building multimodal AI applications? Contact Jishu Labs for expert AI consulting and development.
About Sarah Johnson
Sarah Johnson is the CTO at Jishu Labs with expertise in AI systems and multimodal applications.