Chat Completions, Assistants API, Embeddings, Vision, Function Calling, Fine-Tuning, and Batch API.
The Chat Completions API is the primary interface for GPT models. It accepts messages (system, user, assistant) and returns model-generated responses.
from openai import OpenAI

client = OpenAI()  # Reads OPENAI_API_KEY from the environment

# ── Basic Chat Completion ──
# A system message fixes the assistant's persona; the user message carries
# the actual question.
conversation = [
    {"role": "system", "content": "You are a helpful Python tutor."},
    {"role": "user", "content": "Explain list comprehensions."},
]
response = client.chat.completions.create(
    model="gpt-4o",
    messages=conversation,
    temperature=0.7,         # moderate randomness
    max_tokens=500,          # cap on completion length
    top_p=0.95,              # nucleus sampling cutoff
    frequency_penalty=0.0,
    presence_penalty=0.0,
    seed=42,                 # best-effort reproducibility
    stop=["\n", "Q:"],       # Stop sequences
)

print(response.choices[0].message.content)
print(f"Usage: prompt={response.usage.prompt_tokens}, "
      f"completion={response.usage.completion_tokens}")
# ── Multi-Turn Conversation ──
# Every prior turn is replayed on each request so the model sees the full
# dialogue context (the API itself is stateless).
def _msg(role, content):
    """Build one chat message dict."""
    return {"role": role, "content": content}

messages = [
    _msg("system", "You are a data scientist."),
    _msg("user", "I have a CSV with sales data."),
    _msg("assistant", "I can help analyze that. What columns do you have?"),
    _msg("user", "Date, Product, Amount, Region"),
    _msg("assistant", "Here's a pandas analysis template..."),
    _msg("user", "How do I group by Region?"),
]
response = client.chat.completions.create(model="gpt-4o", messages=messages)
# ── Streaming Response ──
# With stream=True the call returns an iterator of chunks; each chunk's
# delta carries newly generated text (or None for control events).
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Write a short story."}],
    stream=True,
)
for chunk in stream:
    if (piece := chunk.choices[0].delta.content):
        print(piece, end="", flush=True)
# ── Async Client ──
from openai import AsyncOpenAI
async_client = AsyncOpenAI()
async def chat_async(prompt):
response = await async_client.chat.completions.create(
model="gpt-4o", messages=[{"role": "user", "content": prompt}]
)
return response.choices[0].message.content| Parameter | Type | Default | Description |
|---|---|---|---|
| model | string | required | Model ID: gpt-4o, gpt-4o-mini, o3-mini |
| messages | array | required | Array of message objects (system/user/assistant/tool) |
| temperature | float | 1.0 | Randomness (0=deterministic, 2=creative) |
| max_tokens | int | model default | Max tokens in completion |
| top_p | float | 1.0 | Nucleus sampling (0.1=focused, 1.0=diverse) |
| frequency_penalty | float | 0.0 | -2.0 to 2.0, reduce repetition of frequent tokens |
| presence_penalty | float | 0.0 | -2.0 to 2.0, encourage new topics |
| stop | string/array | null | Stop generation at these sequences |
| seed | int | null | For deterministic sampling (when possible) |
| response_format | object | null | {"type": "json_object"} or {"type": "text"} |
| tools | array | null | Available function definitions for tool calling |
| n | int | 1 | Number of chat completion choices to generate |
Function calling allows the model to generate structured arguments for calling external functions. This enables the LLM to interact with APIs, databases, and other tools.
import json
from openai import OpenAI
client = OpenAI()  # reads OPENAI_API_KEY from the environment
# ── Define Tools ──
# JSON-Schema function definitions the model may choose to call.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"},
                    # In strict mode optionality is expressed by allowing
                    # null, not by omitting the field from "required".
                    "unit": {
                        "type": ["string", "null"],
                        "enum": ["celsius", "fahrenheit", None],
                    },
                },
                # FIX: with "strict": true the API rejects schemas whose
                # "required" list omits any declared property.
                "required": ["city", "unit"],
                "additionalProperties": False,
            },
            "strict": True,
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_products",
            "description": "Search product catalog",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Search query"},
                    "category": {"type": "string", "enum": ["electronics", "clothing", "books"]},
                    "max_price": {"type": "number", "description": "Max price filter"},
                },
                "required": ["query"],
                # Consistent with the first tool: reject unknown keys.
                "additionalProperties": False,
            },
        },
    },
]
# ── Call with Tools ──
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "What is the weather in Paris?"}],
tools=tools,
tool_choice="auto",
)
# ── Handle Tool Call ──
message = response.choices[0].message
if message.tool_calls:
for tool_call in message.tool_calls:
function_name = tool_call.function.name
function_args = json.loads(tool_call.function.arguments)
print(f"Call: {function_name}({function_args})")
# Execute the function
if function_name == "get_weather":
result = get_weather(function_args["city"])
else:
result = "Function not found"
# Send result back to model
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "user", "content": "What is the weather in Paris?"},
message, # Assistant message with tool_calls
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": str(result),
}
],
)
print(response.choices[0].message.content)| Pattern | tool_choice | Description | Use Case |
|---|---|---|---|
| Auto | "auto" | Model decides whether to call a tool | General use, uncertain if tool needed |
| Required | "required" | Model MUST call at least one tool | Data extraction, forced function use |
| None | "none" | Model will NOT call any tool | Pure conversation, no tool use |
| Specific | {"type": "function", "function": {"name": "fn"}} | Force a specific tool | When you know which function to use |
| Parallel | auto + multiple tools | Model calls multiple tools at once | Independent lookups (weather + stock price) |
The Embeddings API converts text into high-dimensional vector representations. These vectors capture semantic meaning and are used for search, clustering, and recommendation.
from openai import OpenAI
import numpy as np

client = OpenAI()

# ── Create Embeddings ──
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=["Hello world", "Hi there", "Goodbye"],
    encoding_format="float",
)
embeddings = [item.embedding for item in response.data]
print(f"Dimensions: {len(embeddings[0])}")  # 1536

# ── Batch Embeddings (up to 2048 inputs) ──
# FIX: the original literal contained a bare `...` (Ellipsis), which is not
# a string and would be rejected by the API; generate the inputs instead.
texts = [f"Text {i}" for i in range(1, 2001)]
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=texts,
)
# ── Cosine Similarity ──
def cosine_similarity(a, b):
    """Cosine of the angle between vectors *a* and *b* (1.0 = same direction)."""
    va, vb = np.asarray(a), np.asarray(b)
    return np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb))
sim = cosine_similarity(embeddings[0], embeddings[1])
print(f"Similarity: {sim:.4f}") # Close to 1.0 for similar texts
# ── Embedding Models Comparison ──
# text-embedding-3-small: 1536 dims, $0.02/1M tokens
# text-embedding-3-large: 3072 dims, $0.13/1M tokens
# text-embedding-ada-002: 1536 dims, $0.10/1M tokens (legacy)
# ── Reduced Dimensions (cost/performance tradeoff) ──
response = client.embeddings.create(
model="text-embedding-3-small",
input=["Hello world"],
dimensions=512, # Reduce from 1536 to 512
)
# ── Semantic Search Example ──
query_embedding = client.embeddings.create(
model="text-embedding-3-small",
input="machine learning basics"
).data[0].embedding
# Compare against document embeddings
similarities = [(text, cosine_similarity(query_embedding, doc_emb))
for text, doc_emb in zip(doc_texts, doc_embeddings)]
similarities.sort(key=lambda x: x[1], reverse=True)
print(f"Top match: {similarities[0][0]} (score: {similarities[0][1]:.4f})")Fine-tuning trains a model on your specific data to improve performance on your domain. The API handles training, validation, and deployment of fine-tuned models.
import json
from openai import OpenAI
client = OpenAI()
# ── Step 1: Prepare Training Data (JSONL) ──
training_data = [
{"messages": [
{"role": "system", "content": "You are a medical assistant."},
{"role": "user", "content": "What is hypertension?"},
{"role": "assistant", "content": "Hypertension (high blood pressure) is a condition..."}
]},
# ... 50-10,000+ examples
]
with open("medical_train.jsonl", "w") as f:
for item in training_data:
f.write(json.dumps(item) + "\n")
# Validation file (optional, 50-100 examples)
with open("medical_val.jsonl", "w") as f:
for item in validation_data:
f.write(json.dumps(item) + "\n")
# ── Step 2: Upload Files ──
train_file = client.files.create(file=open("medical_train.jsonl", "rb"),
purpose="fine-tune")
val_file = client.files.create(file=open("medical_val.jsonl", "rb"),
purpose="fine-tune")
# ── Step 3: Create Fine-Tuning Job ──
job = client.fine_tuning.jobs.create(
training_file=train_file.id,
model="gpt-4o-mini-2024-07-18",
hyperparameters={
"n_epochs": "auto",
"batch_size": "auto",
"learning_rate_multiplier": "auto",
},
validation_file=val_file.id,
suffix="medical-assistant",
)
print(f"Job ID: {job.id}")
# ── Step 4: Monitor ──
import time
while True:
job = client.fine_tuning.jobs.retrieve(job.id)
print(f"Status: {job.status}")
if job.status in ["succeeded", "failed"]:
break
time.sleep(60)
# ── Step 5: Use Fine-Tuned Model ──
response = client.chat.completions.create(
model=job.fine_tuned_model,
messages=[
{"role": "system", "content": "You are a medical assistant."},
{"role": "user", "content": "What causes diabetes?"},
],
)The Assistants API provides a higher-level interface for building AI assistants with built-in code execution, file search, and persistent threads.
from openai import OpenAI
client = OpenAI()
# ── Create Assistant ──
assistant = client.beta.assistants.create(
name="Data Analyst",
instructions="You analyze data files and create visualizations. "
"Use Python to process data and generate charts.",
model="gpt-4o",
tools=[
{"type": "code_interpreter"},
{"type": "file_search"},
],
)
# ── Upload File for Analysis ──
file = client.files.create(
file=open("sales_data.csv", "rb"),
purpose="assistants",
)
# ── Create Thread (conversation) ──
thread = client.beta.threads.create()
# ── Add Message ──
client.beta.threads.messages.create(
thread_id=thread.id,
role="user",
content="Analyze the sales data and show top products by revenue.",
attachments=[{"file_id": file.id, "tools": [{"type": "file_search"}]}],
)
# ── Run Assistant ──
run = client.beta.threads.runs.create(
thread_id=thread.id,
assistant_id=assistant.id,
)
# ── Poll for Completion ──
while run.status in ["queued", "in_progress", "requires_action"]:
run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
if run.status == "requires_action":
# Handle tool calls (e.g., user confirmation)
pass
time.sleep(1)
# ── Get Messages ──
messages = client.beta.threads.messages.list(thread_id=thread.id)
for msg in messages.data:
print(f"{msg.role}: {msg.content[0].text.value[:200]}")| Component | Purpose | Features |
|---|---|---|
| Assistant | AI persona with instructions, model, and tools | Configurable model, system prompt, tools |
| Thread | Persistent conversation state | Maintains full message history across runs |
| Message | Individual user/assistant messages | Supports text, images, file attachments |
| Run | Execution of assistant on a thread | Queued, in_progress, completed, failed |
| Code Interpreter | Execute Python code in sandbox | File I/O, data analysis, chart generation |
| File Search | Search uploaded files for context | Automatic chunking, embedding, retrieval |
Understanding OpenAI pricing is essential for production applications. Costs are based on token usage (input + output tokens) and vary by model.
| Model | Input (1M tokens) | Output (1M tokens) | Context | Best For |
|---|---|---|---|---|
| GPT-4o | $2.50 | $10.00 | 128K | General purpose, multimodal |
| GPT-4o-mini | $0.15 | $0.60 | 128K | High volume, simple tasks |
| o3-mini | $1.10 | $4.40 | 200K | Reasoning, coding, math |
| o3 | $10.00 | $40.00 | 200K | Deep reasoning, complex problems |
| GPT-4.1 | $2.00 | $8.00 | 1M | Long context, instructions |
| GPT-4.1-mini | $0.40 | $1.60 | 1M | Fast, long context |
| text-embedding-3-small | $0.02 | - | - | Semantic search, clustering |
| text-embedding-3-large | $0.13 | - | - | Higher quality embeddings |
| whisper-1 | $0.006/min | - | - | Speech-to-text |
| tts-1 | $15.00/1M chars | - | - | Text-to-speech |
| gpt-4o-audio-preview | $2.50 | $10.00 | 128K | Audio in/out |
import tiktoken
# ── Token Counting ──
encoding = tiktoken.encoding_for_model("gpt-4o")
def count_tokens(text: str) -> int:
return len(encoding.encode(text))
text = "Hello, how are you today?"
print(f"Tokens: {count_tokens(text)}") # e.g., 7
# ── Estimate Cost ──
def estimate_cost(prompt: str, model: str = "gpt-4o") -> float:
"""Estimate input cost for a single prompt."""
pricing = {
"gpt-4o": 2.50 / 1_000_000,
"gpt-4o-mini": 0.15 / 1_000_000,
"o3-mini": 1.10 / 1_000_000,
}
tokens = count_tokens(prompt)
return tokens * pricing.get(model, 0.01 / 1000)
# ── Caching Strategy ──
from functools import lru_cache
@lru_cache(maxsize=1000)
def cached_completion(prompt_hash: str, model: str) -> str:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt_hash}],
)
return response.choices[0].message.content
# ── Prompt Compression Tips ──
# 1. Remove unnecessary system prompt words
# 2. Use shorter examples in few-shot
# 3. Truncate context to essential parts
# 4. Cache embeddings for repeated queries
# 5. Use GPT-4o-mini for simple classification tasksProduction applications must handle API errors gracefully, manage rate limits, and implement retries with exponential backoff.
from openai import OpenAI, RateLimitError, APIError, APIConnectionError
import time
import logging
client = OpenAI()
logger = logging.getLogger(__name__)
# ── Retry with Exponential Backoff ──
def chat_with_retry(messages, model="gpt-4o", max_retries=5):
"""Call API with automatic retry on rate limit errors."""
for attempt in range(max_retries):
try:
response = client.chat.completions.create(
model=model, messages=messages,
timeout=30.0, # Request timeout
)
return response.choices[0].message.content
except RateLimitError as e:
wait = 2 ** attempt + 1 # 1, 3, 7, 15, 31 seconds
logger.warning(f"Rate limited. Retrying in {wait}s (attempt {attempt+1})")
time.sleep(wait)
except APIConnectionError:
logger.error("Connection error. Check network.")
time.sleep(5)
except APIError as e:
logger.error(f"API Error: {e}")
raise
raise Exception(f"Failed after {max_retries} retries")
# ── Content Moderation ──
def check_safety(text: str) -> dict:
"""Check text for harmful content."""
result = client.moderations.create(input=text)
flagged = result.results[0].flagged
categories = result.results[0].categories.model_dump()
if flagged:
logger.warning(f"Flagged: {categories}")
return {"safe": False, "categories": categories}
return {"safe": True}
# ── Async Batch Processing with Rate Limiting ──
import asyncio
from asyncio import Semaphore
async def process_batch(prompts, model="gpt-4o", concurrency=10):
"""Process multiple prompts with concurrency control."""
semaphore = Semaphore(concurrency)
async_client = AsyncOpenAI()
async def process_one(prompt):
async with semaphore:
response = await async_client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
)
return response.choices[0].message.content
tasks = [process_one(p) for p in prompts]
return await asyncio.gather(*tasks)| Error | HTTP Code | Cause | Solution |
|---|---|---|---|
| RateLimitError | 429 | Too many requests | Exponential backoff, reduce concurrency |
| AuthenticationError | 401 | Invalid API key | Check OPENAI_API_KEY env variable |
| BadRequestError | 400 | Invalid request format | Check message format, model name |
| NotFoundError | 404 | Model/resource not found | Check model ID, file ID |
| TimeoutError | - | Request exceeded timeout | Increase timeout, use streaming |
| APIConnectionError | - | Network issue | Check internet, retry with backoff |
| ContextLengthExceeded | 400 | Input too long | Truncate messages, use longer context model |