OpenAI, Azure, Bedrock, Google, RAG & Caching
# Simple, readable API interactions
from openai import OpenAI
client = OpenAI(api_key="your-key")
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello AI!"}]
)
print(response.choices[0].message.content)
Python advantages:
- Rich ecosystem of AI libraries
- Easy API integration
- Excellent data handling capabilities
- Strong community support
from openai import OpenAI
import os
# Secure API key management
client = OpenAI(
api_key=os.getenv("OPENAI_API_KEY")
)
def generate_text(prompt: str, model: str = "gpt-3.5-turbo") -> str:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
max_tokens=1000,
temperature=0.7 # Controls randomness (0.0-2.0; lower = more deterministic)
)
return response.choices[0].message.content
Key parameters:
- model: choose gpt-3.5-turbo (faster, cheaper) or gpt-4 (higher quality)
- temperature: 0.0 = deterministic, higher values = more creative (up to 2.0)
- max_tokens: limit response length
def stream_response(prompt: str):
"""Stream AI responses word by word"""
stream = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": prompt}],
stream=True
)
for chunk in stream:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
When to use streaming:
- Long responses (stories, explanations)
- Interactive chat applications
- Better perceived responsiveness for users
# System messages for behavior control
messages = [
{"role": "system", "content": "You are a helpful Python tutor"},
{"role": "user", "content": "Explain loops"}
]
# Function calling (tools)
functions = [{
"name": "get_weather",
"description": "Get weather for a location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string"}
}
}
}]
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=messages,
functions=functions
)
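If the model decides to call the function, the reply contains a function_call instead of plain text. Below is a minimal handling sketch using the legacy functions/function_call fields (the newer tools parameter follows the same pattern); get_weather is a hypothetical helper you would implement yourself:
import json

message = response.choices[0].message
if message.function_call:
    args = json.loads(message.function_call.arguments)   # arguments arrive as a JSON string
    weather = get_weather(**args)                         # hypothetical helper you implement
    # Send the function result back so the model can write the final answer
    followup = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages + [
            {"role": "assistant", "content": None,
             "function_call": {"name": message.function_call.name,
                               "arguments": message.function_call.arguments}},
            {"role": "function", "name": "get_weather", "content": json.dumps(weather)},
        ],
    )
    print(followup.choices[0].message.content)
else:
    print(message.content)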
Why Azure OpenAI?
- Data residency & compliance
- Private endpoints
- Enterprise security
- Same models, better governance
# OpenAI: Use model names
response = openai_client.chat.completions.create(
model="gpt-3.5-turbo", # Model name
messages=[...]
)
# Azure: Use deployment names
response = azure_client.chat.completions.create(
model="my-gpt-35-deployment", # Your deployment name
messages=[...]
)
Azure setup steps:
1. Create an Azure OpenAI resource
2. Deploy models to endpoints
3. Use deployment names, not model names (see the client setup sketch below)
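A minimal client-setup sketch, assuming the openai Python SDK v1+; the endpoint and API version below are placeholders you replace with your resource's values:
import os
from openai import AzureOpenAI

azure_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01",                                  # pick the version your resource supports
    azure_endpoint="https://your-resource.openai.azure.com",   # placeholder endpoint
)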
Access multiple AI providers through a single AWS Bedrock API:
- Anthropic Claude (reasoning)
- AI21 Jurassic (creativity)
- Cohere Command (enterprise)
- Meta Llama (open source)
import json
import boto3
bedrock = boto3.client("bedrock-runtime")
def bedrock_generate(prompt: str, model_id: str):
# Claude format
if "anthropic.claude" in model_id:
body = json.dumps({
"prompt": f"\n\nHuman: {prompt}\n\nAssistant:",
"max_tokens_to_sample": 1000
})
# AI21 format
elif "ai21.j2" in model_id:
body = json.dumps({
"prompt": prompt,
"maxTokens": 1000
})
else:
raise ValueError(f"Unsupported model_id: {model_id}")
response = bedrock.invoke_model(
modelId=model_id,
body=body
)
return json.loads(response['body'].read())
Each model has different:
- Input/output formats
- Strengths and use cases
- Pricing models
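For instance, a usage sketch of bedrock_generate (the model IDs and response fields below follow the legacy Claude v2 and Jurassic-2 formats and may differ for other model versions):
result = bedrock_generate("Summarize this meeting in two sentences.", "anthropic.claude-v2")
print(result.get("completion"))                      # Claude v2 returns a 'completion' field

result = bedrock_generate("Write a product tagline.", "ai21.j2-mid-v1")
print(result["completions"][0]["data"]["text"])      # Jurassic-2 returns a 'completions' list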
Google AI Studio (Gemini API):
- Quick experimentation
- Simple API access
- Free tier available
Vertex AI:
- Production workloads
- Enterprise features
- Advanced ML ops
import google.generativeai as genai
genai.configure(api_key=os.getenv("GOOGLE_AI_API_KEY"))
def google_generate(prompt: str):
model = genai.GenerativeModel("gemini-pro")
response = model.generate_content(prompt)
return response.text
# Multimodal capabilities
model = genai.GenerativeModel("gemini-pro-vision")
response = model.generate_content([
"What's in this image?",
image_data  # e.g. a PIL.Image or inline image bytes
])
Gemini advantages:
- Large context windows (up to 1M+ tokens on newer Gemini models)
- Multimodal (text + images)
- Fast inference speeds
# Basic AI - Limited knowledge
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "What's our company policy?"}]
)
# AI doesn't know your specific company policies!
Limitations:
- Training data cutoff dates
- No access to private/recent data
- Generic responses only
graph TD
A[User Query] --> B[Document Retrieval]
B --> C[Relevant Docs Found]
C --> D[Enhanced Prompt]
D --> E[AI Generation]
E --> F[Contextual Response]
RAG = Retrieval + Augmentation + Generation (Retrieval-Augmented Generation)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class SimpleRAG:
def __init__(self):
self.documents = []
self.vectorizer = TfidfVectorizer()
def add_documents(self, docs):
self.documents = docs
self.vectors = self.vectorizer.fit_transform(docs)
def retrieve(self, query, top_k=3):
query_vec = self.vectorizer.transform([query])
similarities = cosine_similarity(query_vec, self.vectors)
top_indices = similarities.argsort()[0][-top_k:][::-1]
return [self.documents[i] for i in top_indices]
def generate_response(self, query):
context = "\n".join(self.retrieve(query))
prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
return generate_text(prompt)
import chromadb
from sentence_transformers import SentenceTransformer
class AdvancedRAG:
def __init__(self):
self.client = chromadb.Client()
self.collection = self.client.create_collection("docs")
self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
def add_documents(self, documents):
embeddings = self.encoder.encode(documents)
self.collection.add(
embeddings=embeddings.tolist(),
documents=documents,
ids=[f"doc_{i}" for i in range(len(documents))]
)
def semantic_search(self, query, n_results=3):
query_embedding = self.encoder.encode([query])
results = self.collection.query(
query_embeddings=query_embedding.tolist(),
n_results=n_results
)
return results['documents'][0]
Vector database benefits:
- Better semantic understanding
- Faster similarity search
- Metadata filtering capabilities (sketch below)
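A sketch of the metadata-filtering point above, assuming a chromadb collection and SentenceTransformer encoder like the ones in AdvancedRAG; chunks and the "source" field are illustrative:
# Attach metadata when adding documents (one dict per document)
collection.add(
    documents=chunks,
    embeddings=encoder.encode(chunks).tolist(),
    metadatas=[{"source": "handbook"} for _ in chunks],
    ids=[f"doc_{i}" for i in range(len(chunks))],
)

# Restrict a query to documents whose metadata matches the filter
results = collection.query(
    query_embeddings=encoder.encode(["vacation policy"]).tolist(),
    n_results=3,
    where={"source": "handbook"},
)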
Problems without caching:
- Repeated API calls for the same questions
- High latency (network round-trips)
- Expensive API costs
- Poor user experience
Benefits of caching:
- 90%+ faster repeated responses
- Significant cost savings
- Better reliability
import time
from typing import Dict, Optional
class ResponseCache:
def __init__(self, ttl_seconds=3600):
self.cache: Dict[str, dict] = {}
self.ttl = ttl_seconds
def get(self, key: str) -> Optional[str]:
if key in self.cache:
entry = self.cache[key]
if time.time() - entry['timestamp'] < self.ttl:
return entry['response']
del self.cache[key] # Remove expired
return None
def set(self, key: str, response: str):
self.cache[key] = {
'response': response,
'timestamp': time.time()
}
cache = ResponseCache(ttl_seconds=1800)  # 30-minute TTL
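A small sketch wiring this cache around the generate_text helper from earlier (cached_generate is just an illustrative name):
def cached_generate(prompt: str) -> str:
    # Serve repeated prompts from memory; fall back to the API on a miss
    hit = cache.get(prompt)
    if hit is not None:
        return hit
    response = generate_text(prompt)
    cache.set(prompt, response)
    return response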
import sqlite3
import hashlib
from datetime import datetime
class PersistentCache:
def __init__(self, db_path="ai_cache.db"):
self.db_path = db_path
self.init_db()
def init_db(self):
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS responses (
prompt_hash TEXT PRIMARY KEY,
response TEXT,
created_at TIMESTAMP,
model TEXT
)
""")
conn.commit()
conn.close()
def get(self, prompt: str, model: str):
prompt_hash = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute(
"SELECT response FROM responses WHERE prompt_hash = ?",
(prompt_hash,)
)
result = cursor.fetchone()
conn.close()
return result[0] if result else None
def set(self, prompt: str, response: str, model: str):
# Store the response under the same model:prompt hash that get() uses
prompt_hash = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute(
"INSERT OR REPLACE INTO responses VALUES (?, ?, ?, ?)",
(prompt_hash, response, datetime.now().isoformat(), model)
)
conn.commit()
conn.close()
import time
import random
def robust_ai_call(prompt, max_retries=3):
for attempt in range(max_retries):
try:
return generate_text(prompt)
except Exception as e:
if attempt == max_retries - 1:
raise e
# Exponential backoff with jitter
delay = (2 ** attempt) + random.uniform(0, 1)
time.sleep(delay)
Error handling strategies:
- Exponential backoff for rate limits
- Circuit breakers for service failures (sketch below)
- Graceful degradation with fallbacks
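The list above mentions circuit breakers; here is a minimal sketch of the idea (after repeated failures, stop calling the provider for a cool-down period). Class and parameter names are illustrative:
import time

class CircuitBreaker:
    def __init__(self, failure_threshold=5, reset_seconds=60):
        self.failure_threshold = failure_threshold
        self.reset_seconds = reset_seconds
        self.failures = 0
        self.opened_at = None

    def call(self, func, *args, **kwargs):
        # While "open", reject calls until the cool-down has passed
        if self.opened_at and time.time() - self.opened_at < self.reset_seconds:
            raise RuntimeError("Circuit open - skipping call")
        try:
            result = func(*args, **kwargs)
            self.failures = 0            # success resets the breaker
            self.opened_at = None
            return result
        except Exception:
            self.failures += 1
            if self.failures >= self.failure_threshold:
                self.opened_at = time.time()   # trip the breaker
            raise

breaker = CircuitBreaker()
# breaker.call(generate_text, "Hello!")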
from collections import deque
import time
class RateLimiter:
def __init__(self, max_calls=60, window_seconds=60):
self.max_calls = max_calls
self.window = window_seconds
self.calls = deque()
def acquire(self):
now = time.time()
# Remove old calls
while self.calls and self.calls[0] <= now - self.window:
self.calls.popleft()
if len(self.calls) < self.max_calls:
self.calls.append(now)
return True
return False
def wait_if_needed(self):
while not self.acquire():
time.sleep(0.1)
class MultiProviderAI:
def __init__(self):
self.providers = [
("openai", self.openai_generate),
("azure", self.azure_generate),
("bedrock", self.bedrock_generate),
]  # each *_generate method wraps one provider's SDK call (sketch after the class)
def generate_with_fallback(self, prompt):
for name, provider in self.providers:
try:
result = provider(prompt)
if result:
return result
except Exception as e:
print(f"{name} failed: {e}")
continue
raise Exception("All providers failed")
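The openai_generate, azure_generate, and bedrock_generate methods are referenced but not shown; as a sketch of how one of them might look inside the class, it can simply wrap the OpenAI client from earlier so every provider exposes the same interface:
    def openai_generate(self, prompt: str) -> str:
        # Thin wrapper so all providers share one call signature
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content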
class DocumentQASystem:
def __init__(self):
self.rag = AdvancedRAG()
self.cache = PersistentCache()
self.rate_limiter = RateLimiter()
def load_documents(self, file_path):
with open(file_path, 'r') as f:
content = f.read()
# Split into chunks
chunks = content.split('\n\n')
self.rag.add_documents(chunks)
def ask_question(self, question):
# Check cache first
cached = self.cache.get(question, "gpt-3.5-turbo")
if cached:
return cached
# Rate limiting
self.rate_limiter.wait_if_needed()
# Generate with RAG
response = self.rag.generate_response(question)
# Cache result
self.cache.set(question, response, "gpt-3.5-turbo")
return response
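A usage sketch (the file path and question are placeholders):
qa = DocumentQASystem()
qa.load_documents("company_handbook.txt")   # placeholder path
print(qa.ask_question("How many vacation days do new employees get?"))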
class CodeAssistant:
def __init__(self):
self.system_prompt = """
You are an expert Python programmer.
Provide clear, working code examples.
Explain complex concepts simply.
"""
def get_code_help(self, question):
messages = [
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": question}
]
response = client.chat.completions.create(
model="gpt-4",
messages=messages,
temperature=0.3 # Lower for more consistent code
)
return response.choices[0].message.content
Multiple AI Providers:
- OpenAI: Best general performance
- Azure: Enterprise features
- Bedrock: Multi-model access
- Google: Multimodal capabilities
Essential Techniques:
- RAG for domain knowledge
- Caching for performance
- Error handling for reliability
- Rate limiting for stability
✅ Security: API keys in environment variables
✅ Caching: Implement a persistent cache
✅ Monitoring: Log usage and errors (sketch below)
✅ Rate Limiting: Respect API limits
✅ Fallbacks: Multiple provider support
✅ Testing: Comprehensive error scenarios
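For the monitoring item, a minimal sketch using Python's standard logging module around the generate_text helper (names are illustrative):
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ai_client")

def monitored_generate(prompt: str) -> str:
    # Log latency and failures for every API call
    start = time.time()
    try:
        result = generate_text(prompt)
        logger.info("ai_call ok latency=%.2fs prompt_chars=%d",
                    time.time() - start, len(prompt))
        return result
    except Exception:
        logger.exception("ai_call failed after %.2fs", time.time() - start)
        raise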
Remember: Start simple, then add complexity as needed!
Q: Which AI provider should I choose?
A: It depends on your needs:
- OpenAI: Best for general use
- Azure: Enterprise requirements
- Bedrock: When you want multiple models
- Google: Multimodal applications
Q: How much does caching help?
A: Typically 80-95% cost reduction for repeated queries
Q: Is RAG always necessary?
A: No. Use RAG when you need:
- Current information
- Private/proprietary data
- Domain-specific knowledge
Resources:
- OpenAI API Documentation
- Azure OpenAI Service Guide
- AWS Bedrock Developer Guide
- Google AI Studio Documentation
Happy AI coding! 🤖✨