Chat Completions, Assistants API, Embeddings, Vision, Function Calling, Fine-Tuning, and Batch API.
The Chat Completions API is the primary interface for GPT models. It accepts messages (system, user, assistant) and returns model-generated responses.
from openai import OpenAI

client = OpenAI()  # Reads OPENAI_API_KEY from the environment

# ── Basic Chat Completion ──
# A system message fixes the assistant's persona; the user message carries
# the actual question.
conversation = [
    {"role": "system", "content": "You are a helpful Python tutor."},
    {"role": "user", "content": "Explain list comprehensions."},
]
response = client.chat.completions.create(
    model="gpt-4o",
    messages=conversation,
    temperature=0.7,         # moderate randomness
    max_tokens=500,          # cap on completion length
    top_p=0.95,              # nucleus sampling cutoff
    frequency_penalty=0.0,
    presence_penalty=0.0,
    seed=42,                 # best-effort reproducibility
    stop=["\n", "Q:"],       # Stop sequences
)

print(response.choices[0].message.content)
print(f"Usage: prompt={response.usage.prompt_tokens}, "
      f"completion={response.usage.completion_tokens}")
# ── Multi-Turn Conversation ──
# Every prior turn is replayed on each request so the model sees the full
# dialogue context (the API itself is stateless).
def _msg(role, content):
    """Build one chat message dict."""
    return {"role": role, "content": content}

messages = [
    _msg("system", "You are a data scientist."),
    _msg("user", "I have a CSV with sales data."),
    _msg("assistant", "I can help analyze that. What columns do you have?"),
    _msg("user", "Date, Product, Amount, Region"),
    _msg("assistant", "Here's a pandas analysis template..."),
    _msg("user", "How do I group by Region?"),
]
response = client.chat.completions.create(model="gpt-4o", messages=messages)
# ── Streaming Response ──
# With stream=True the call returns an iterator of chunks; each chunk's
# delta carries newly generated text (or None for control events).
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Write a short story."}],
    stream=True,
)
for chunk in stream:
    if (piece := chunk.choices[0].delta.content):
        print(piece, end="", flush=True)
# ── Async Client ──
from openai import AsyncOpenAI
async_client = AsyncOpenAI()
async def chat_async(prompt):
response = await async_client.chat.completions.create(
model="gpt-4o", messages=[{"role": "user", "content": prompt}]
)
return response.choices[0].message.content| Parameter | Type | Default | Description |
|---|---|---|---|
| model | string | required | Model ID: gpt-4o, gpt-4o-mini, o3-mini |
| messages | array | required | Array of message objects (system/user/assistant/tool) |
| temperature | float | 1.0 | Randomness (0=deterministic, 2=creative) |
| max_tokens | int | model default | Max tokens in completion |
| top_p | float | 1.0 | Nucleus sampling (0.1=focused, 1.0=diverse) |
| frequency_penalty | float | 0.0 | -2.0 to 2.0, reduce repetition of frequent tokens |
| presence_penalty | float | 0.0 | -2.0 to 2.0, encourage new topics |
| stop | string/array | null | Stop generation at these sequences |
| seed | int | null | For deterministic sampling (when possible) |
| response_format | object | null | {"type": "json_object"} or {"type": "text"} |
| tools | array | null | Available function definitions for tool calling |
| n | int | 1 | Number of chat completion choices to generate |
Function calling allows the model to generate structured arguments for calling external functions. This enables the LLM to interact with APIs, databases, and other tools.
import json
from openai import OpenAI
client = OpenAI()  # reads OPENAI_API_KEY from the environment
# ── Define Tools ──
# JSON-Schema function definitions the model may choose to call.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"},
                    # In strict mode optionality is expressed by allowing
                    # null, not by omitting the field from "required".
                    "unit": {
                        "type": ["string", "null"],
                        "enum": ["celsius", "fahrenheit", None],
                    },
                },
                # FIX: with "strict": true the API rejects schemas whose
                # "required" list omits any declared property.
                "required": ["city", "unit"],
                "additionalProperties": False,
            },
            "strict": True,
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_products",
            "description": "Search product catalog",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Search query"},
                    "category": {"type": "string", "enum": ["electronics", "clothing", "books"]},
                    "max_price": {"type": "number", "description": "Max price filter"},
                },
                "required": ["query"],
                # Consistent with the first tool: reject unknown keys.
                "additionalProperties": False,
            },
        },
    },
]
# ── Call with Tools ──
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "What is the weather in Paris?"}],
tools=tools,
tool_choice="auto",
)
# ── Handle Tool Call ──
message = response.choices[0].message
if message.tool_calls:
for tool_call in message.tool_calls:
function_name = tool_call.function.name
function_args = json.loads(tool_call.function.arguments)
print(f"Call: {function_name}({function_args})")
# Execute the function
if function_name == "get_weather":
result = get_weather(function_args["city"])
else:
result = "Function not found"
# Send result back to model
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "user", "content": "What is the weather in Paris?"},
message, # Assistant message with tool_calls
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": str(result),
}
],
)
print(response.choices[0].message.content)| Pattern | tool_choice | Description | Use Case |
|---|---|---|---|
| Auto | "auto" | Model decides whether to call a tool | General use, uncertain if tool needed |
| Required | "required" | Model MUST call at least one tool | Data extraction, forced function use |
| None | "none" | Model will NOT call any tool | Pure conversation, no tool use |
| Specific | {"type": "function", "function": {"name": "fn"}} | Force a specific tool | When you know which function to use |
| Parallel | auto + multiple tools | Model calls multiple tools at once | Independent lookups (weather + stock price) |
The Embeddings API converts text into high-dimensional vector representations. These vectors capture semantic meaning and are used for search, clustering, and recommendation.
from openai import OpenAI
import numpy as np

client = OpenAI()

# ── Create Embeddings ──
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=["Hello world", "Hi there", "Goodbye"],
    encoding_format="float",
)
embeddings = [item.embedding for item in response.data]
print(f"Dimensions: {len(embeddings[0])}")  # 1536

# ── Batch Embeddings (up to 2048 inputs) ──
# FIX: the original literal contained a bare `...` (Ellipsis), which is not
# a string and would be rejected by the API; generate the inputs instead.
texts = [f"Text {i}" for i in range(1, 2001)]
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=texts,
)
# ── Cosine Similarity ──
def cosine_similarity(a, b):
    """Cosine of the angle between vectors *a* and *b* (1.0 = same direction)."""
    va, vb = np.asarray(a), np.asarray(b)
    return np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb))
sim = cosine_similarity(embeddings[0], embeddings[1])
print(f"Similarity: {sim:.4f}") # Close to 1.0 for similar texts
# ── Embedding Models Comparison ──
# text-embedding-3-small: 1536 dims, $0.02/1M tokens
# text-embedding-3-large: 3072 dims, $0.13/1M tokens
# text-embedding-ada-002: 1536 dims, $0.10/1M tokens (legacy)
# ── Reduced Dimensions (cost/performance tradeoff) ──
response = client.embeddings.create(
model="text-embedding-3-small",
input=["Hello world"],
dimensions=512, # Reduce from 1536 to 512
)
# ── Semantic Search Example ──
query_embedding = client.embeddings.create(
model="text-embedding-3-small",
input="machine learning basics"
).data[0].embedding
# Compare against document embeddings
similarities = [(text, cosine_similarity(query_embedding, doc_emb))
for text, doc_emb in zip(doc_texts, doc_embeddings)]
similarities.sort(key=lambda x: x[1], reverse=True)
print(f"Top match: {similarities[0][0]} (score: {similarities[0][1]:.4f})")Fine-tuning trains a model on your specific data to improve performance on your domain. The API handles training, validation, and deployment of fine-tuned models.
import json
from openai import OpenAI
client = OpenAI()
# ── Step 1: Prepare Training Data (JSONL) ──
training_data = [
{"messages": [
{"role": "system", "content": "You are a medical assistant."},
{"role": "user", "content": "What is hypertension?"},
{"role": "assistant", "content": "Hypertension (high blood pressure) is a condition..."}
]},
# ... 50-10,000+ examples
]
with open("medical_train.jsonl", "w") as f:
for item in training_data:
f.write(json.dumps(item) + "\n")
# Validation file (optional, 50-100 examples)
with open("medical_val.jsonl", "w") as f:
for item in validation_data:
f.write(json.dumps(item) + "\n")
# ── Step 2: Upload Files ──
train_file = client.files.create(file=open("medical_train.jsonl", "rb"),
purpose="fine-tune")
val_file = client.files.create(file=open("medical_val.jsonl", "rb"),
purpose="fine-tune")
# ── Step 3: Create Fine-Tuning Job ──
job = client.fine_tuning.jobs.create(
training_file=train_file.id,
model="gpt-4o-mini-2024-07-18",
hyperparameters={
"n_epochs": "auto",
"batch_size": "auto",
"learning_rate_multiplier": "auto",
},
validation_file=val_file.id,
suffix="medical-assistant",
)
print(f"Job ID: {job.id}")
# ── Step 4: Monitor ──
import time
while True:
job = client.fine_tuning.jobs.retrieve(job.id)
print(f"Status: {job.status}")
if job.status in ["succeeded", "failed"]:
break
time.sleep(60)
# ── Step 5: Use Fine-Tuned Model ──
response = client.chat.completions.create(
model=job.fine_tuned_model,
messages=[
{"role": "system", "content": "You are a medical assistant."},
{"role": "user", "content": "What causes diabetes?"},
],
)The Assistants API provides a higher-level interface for building AI assistants with built-in code execution, file search, and persistent threads.
from openai import OpenAI
client = OpenAI()
# ── Create Assistant ──
assistant = client.beta.assistants.create(
name="Data Analyst",
instructions="You analyze data files and create visualizations. "
"Use Python to process data and generate charts.",
model="gpt-4o",
tools=[
{"type": "code_interpreter"},
{"type": "file_search"},
],
)
# ── Upload File for Analysis ──
file = client.files.create(
file=open("sales_data.csv", "rb"),
purpose="assistants",
)
# ── Create Thread (conversation) ──
thread = client.beta.threads.create()
# ── Add Message ──
client.beta.threads.messages.create(
thread_id=thread.id,
role="user",
content="Analyze the sales data and show top products by revenue.",
attachments=[{"file_id": file.id, "tools": [{"type": "file_search"}]}],
)
# ── Run Assistant ──
run = client.beta.threads.runs.create(
thread_id=thread.id,
assistant_id=assistant.id,
)
# ── Poll for Completion ──
while run.status in ["queued", "in_progress", "requires_action"]:
run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
if run.status == "requires_action":
# Handle tool calls (e.g., user confirmation)
pass
time.sleep(1)
# ── Get Messages ──
messages = client.beta.threads.messages.list(thread_id=thread.id)
for msg in messages.data:
print(f"{msg.role}: {msg.content[0].text.value[:200]}")| Component | Purpose | Features |
|---|---|---|
| Assistant | AI persona with instructions, model, and tools | Configurable model, system prompt, tools |
| Thread | Persistent conversation state | Maintains full message history across runs |
| Message | Individual user/assistant messages | Supports text, images, file attachments |
| Run | Execution of assistant on a thread | Queued, in_progress, completed, failed |
| Code Interpreter | Execute Python code in sandbox | File I/O, data analysis, chart generation |
| File Search | Search uploaded files for context | Automatic chunking, embedding, retrieval |
Understanding OpenAI pricing is essential for production applications. Costs are based on token usage (input + output tokens) and vary by model.
| Model | Input (1M tokens) | Output (1M tokens) | Context | Best For |
|---|---|---|---|---|
| GPT-4o | $2.50 | $10.00 | 128K | General purpose, multimodal |
| GPT-4o-mini | $0.15 | $0.60 | 128K | High volume, simple tasks |
| o3-mini | $1.10 | $4.40 | 200K | Reasoning, coding, math |
| o3 | $10.00 | $40.00 | 200K | Deep reasoning, complex problems |
| GPT-4.1 | $2.00 | $8.00 | 1M | Long context, instructions |
| GPT-4.1-mini | $0.40 | $1.60 | 1M | Fast, long context |
| text-embedding-3-small | $0.02 | - | - | Semantic search, clustering |
| text-embedding-3-large | $0.13 | - | - | Higher quality embeddings |
| whisper-1 | $0.006/min | - | - | Speech-to-text |
| tts-1 | $15.00/1M chars | - | - | Text-to-speech |
| gpt-4o-audio-preview | $2.50 | $10.00 | 128K | Audio in/out |
import tiktoken
# ── Token Counting ──
encoding = tiktoken.encoding_for_model("gpt-4o")
def count_tokens(text: str) -> int:
return len(encoding.encode(text))
text = "Hello, how are you today?"
print(f"Tokens: {count_tokens(text)}") # e.g., 7
# ── Estimate Cost ──
def estimate_cost(prompt: str, model: str = "gpt-4o") -> float:
"""Estimate input cost for a single prompt."""
pricing = {
"gpt-4o": 2.50 / 1_000_000,
"gpt-4o-mini": 0.15 / 1_000_000,
"o3-mini": 1.10 / 1_000_000,
}
tokens = count_tokens(prompt)
return tokens * pricing.get(model, 0.01 / 1000)
# ── Caching Strategy ──
from functools import lru_cache
@lru_cache(maxsize=1000)
def cached_completion(prompt_hash: str, model: str) -> str:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt_hash}],
)
return response.choices[0].message.content
# ── Prompt Compression Tips ──
# 1. Remove unnecessary system prompt words
# 2. Use shorter examples in few-shot
# 3. Truncate context to essential parts
# 4. Cache embeddings for repeated queries
# 5. Use GPT-4o-mini for simple classification tasksProduction applications must handle API errors gracefully, manage rate limits, and implement retries with exponential backoff.
from openai import OpenAI, RateLimitError, APIError, APIConnectionError
import time
import logging
client = OpenAI()
logger = logging.getLogger(__name__)
# ── Retry with Exponential Backoff ──
def chat_with_retry(messages, model="gpt-4o", max_retries=5):
"""Call API with automatic retry on rate limit errors."""
for attempt in range(max_retries):
try:
response = client.chat.completions.create(
model=model, messages=messages,
timeout=30.0, # Request timeout
)
return response.choices[0].message.content
except RateLimitError as e:
wait = 2 ** attempt + 1 # 1, 3, 7, 15, 31 seconds
logger.warning(f"Rate limited. Retrying in {wait}s (attempt {attempt+1})")
time.sleep(wait)
except APIConnectionError:
logger.error("Connection error. Check network.")
time.sleep(5)
except APIError as e:
logger.error(f"API Error: {e}")
raise
raise Exception(f"Failed after {max_retries} retries")
# ── Content Moderation ──
def check_safety(text: str) -> dict:
"""Check text for harmful content."""
result = client.moderations.create(input=text)
flagged = result.results[0].flagged
categories = result.results[0].categories.model_dump()
if flagged:
logger.warning(f"Flagged: {categories}")
return {"safe": False, "categories": categories}
return {"safe": True}
# ── Async Batch Processing with Rate Limiting ──
import asyncio
from asyncio import Semaphore
async def process_batch(prompts, model="gpt-4o", concurrency=10):
"""Process multiple prompts with concurrency control."""
semaphore = Semaphore(concurrency)
async_client = AsyncOpenAI()
async def process_one(prompt):
async with semaphore:
response = await async_client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
)
return response.choices[0].message.content
tasks = [process_one(p) for p in prompts]
return await asyncio.gather(*tasks)| Error | HTTP Code | Cause | Solution |
|---|---|---|---|
| RateLimitError | 429 | Too many requests | Exponential backoff, reduce concurrency |
| AuthenticationError | 401 | Invalid API key | Check OPENAI_API_KEY env variable |
| BadRequestError | 400 | Invalid request format | Check message format, model name |
| NotFoundError | 404 | Model/resource not found | Check model ID, file ID |
| TimeoutError | - | Request exceeded timeout | Increase timeout, use streaming |
| APIConnectionError | - | Network issue | Check internet, retry with backoff |
| ContextLengthExceeded | 400 | Input too long | Truncate messages, use longer context model |