Fine-tuning allows you to customize language models for specific tasks, domains, or styles. While powerful foundation models work well out-of-the-box, fine-tuning can dramatically improve performance for specialized applications. This guide covers practical fine-tuning techniques for 2026.
When to Fine-Tune
- Domain specialization: Medical, legal, or technical domains with specific terminology
- Style consistency: Matching brand voice or specific output formats
- Task optimization: Structured outputs, classification, or extraction tasks
- Performance improvement: When prompting alone doesn't achieve required accuracy
Fine-Tuning vs RAG vs Prompting
Try prompting first - Many tasks work well with good prompts
Use RAG - When you need current or specific knowledge
Fine-tune - For specialized behavior, format compliance, or domain expertise
Dataset Preparation
# Dataset preparation for fine-tuning
import json
from typing import List, Dict
from datasets import Dataset
import random
def prepare_training_data(
    examples: List[Dict],
    format_type: str = "chat"
) -> Dataset:
    """Convert raw input/output examples into a fine-tuning Dataset.

    Args:
        examples: Dicts with "input" and "output" keys; "system" is optional
            and defaults to an empty string in chat format.
        format_type: "chat" for messages-style records, "completion" for
            prompt/completion pairs.

    Returns:
        A ``datasets.Dataset`` built from the formatted records.

    Raises:
        ValueError: If ``format_type`` is not recognized. (Previously an
            unknown value silently produced an empty dataset.)

    Chat format produced:
        {"messages": [{"role": "system", "content": "..."},
                      {"role": "user", "content": "..."},
                      {"role": "assistant", "content": "..."}]}
    """
    if format_type not in ("chat", "completion"):
        raise ValueError(f"Unknown format_type: {format_type!r}")

    formatted = []
    for example in examples:
        if format_type == "chat":
            formatted.append({
                "messages": [
                    {"role": "system", "content": example.get("system", "")},
                    {"role": "user", "content": example["input"]},
                    {"role": "assistant", "content": example["output"]}
                ]
            })
        else:  # "completion" — validated above
            formatted.append({
                "prompt": example["input"],
                "completion": example["output"]
            })
    return Dataset.from_list(formatted)
# Quality filtering
def filter_quality(examples: List[Dict]) -> List[Dict]:
    """Drop examples that fail basic length and completeness checks."""

    def _is_acceptable(ex: Dict) -> bool:
        # Very short inputs or outputs carry little training signal.
        if len(ex["input"]) < 10 or len(ex["output"]) < 10:
            return False
        # Avoid very long responses.
        if len(ex["output"]) > 4000:
            return False
        # An output trailing off with "..." looks truncated/incomplete.
        return not ex["output"].strip().endswith("...")

    return [ex for ex in examples if _is_acceptable(ex)]
# Data augmentation
def augment_data(examples: List[Dict]) -> List[Dict]:
    """Return the original examples plus a paraphrased variant of each input.

    Paraphrases are produced by ``paraphrase_input`` (an external model call);
    inputs that fail to paraphrase are simply skipped.
    """
    out: List[Dict] = []
    out.extend(examples)
    for item in examples:
        variant = paraphrase_input(item["input"])
        if not variant:
            continue  # paraphrasing failed — keep only the original
        out.append({
            "input": variant,
            "output": item["output"],
            "system": item.get("system", ""),
        })
    return out
# Split dataset
def create_splits(data: List[Dict], train_ratio: float = 0.9):
    """Randomly split examples into train and validation sets.

    Shuffles a *copy* of the input so the caller's list keeps its original
    order (the previous version called ``random.shuffle`` on the argument,
    mutating the caller's data).

    Args:
        data: Training examples.
        train_ratio: Fraction of examples assigned to the train split.

    Returns:
        {"train": [...], "validation": [...]}
    """
    shuffled = list(data)  # copy first: random.shuffle mutates in place
    random.shuffle(shuffled)
    split_idx = int(len(shuffled) * train_ratio)
    return {
        "train": shuffled[:split_idx],
        "validation": shuffled[split_idx:]
    }
Fine-Tuning with OpenAI
# OpenAI fine-tuning
from openai import OpenAI
import json
# Shared OpenAI client; by default it reads the OPENAI_API_KEY environment
# variable — TODO confirm credentials are configured where this runs.
client = OpenAI()
# 1. Upload training file
def upload_training_file(data: list, filename: str) -> str:
    """Serialize examples to a JSONL file and upload it for fine-tuning.

    Returns the server-side file id needed to start a fine-tuning job.
    """
    # Write one JSON object per line (JSONL), the format the API expects.
    with open(filename, 'w') as out:
        out.writelines(json.dumps(item) + '\n' for item in data)
    # Re-open in binary mode and hand the file to OpenAI.
    with open(filename, 'rb') as payload:
        upload = client.files.create(
            file=payload,
            purpose='fine-tune'
        )
    return upload.id
# 2. Create fine-tuning job
def create_fine_tuning_job(
    training_file_id: str,
    validation_file_id: str = None,
    model: str = "gpt-4o-mini-2024-07-18",
    suffix: str = "my-custom-model"
):
    """Start an OpenAI fine-tuning job on an uploaded training file.

    Returns the job object; poll it to track progress.
    """
    # Pin only the epoch count; let the service choose batch size and
    # learning-rate multiplier.
    hyperparams = {
        "n_epochs": 3,
        "batch_size": "auto",
        "learning_rate_multiplier": "auto"
    }
    return client.fine_tuning.jobs.create(
        training_file=training_file_id,
        validation_file=validation_file_id,
        model=model,
        suffix=suffix,
        hyperparameters=hyperparams,
    )
# 3. Monitor progress
def monitor_job(job_id: str):
    """Poll a fine-tuning job until it reaches a terminal state.

    Prints the status and the most recent events once per minute.

    Returns:
        The fine-tuned model id on success, or None if the job failed or
        was cancelled.
    """
    # Local import: `time` was used below but never imported anywhere in
    # this snippet, which made the sleep call raise NameError.
    import time

    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        print(f"Status: {job.status}")
        if job.status == 'succeeded':
            print(f"Fine-tuned model: {job.fine_tuned_model}")
            return job.fine_tuned_model
        elif job.status == 'failed':
            print(f"Job failed: {job.error}")
            return None
        elif job.status == 'cancelled':
            # Previously this state was not handled and the loop ran forever.
            print("Job cancelled")
            return None
        # Print recent events
        events = client.fine_tuning.jobs.list_events(job_id, limit=5)
        for event in events.data:
            print(f"  {event.message}")
        time.sleep(60)
# 4. Use fine-tuned model
def query_fine_tuned(model_id: str, prompt: str) -> str:
response = client.chat.completions.create(
model=model_id,
messages=[{"role": "user", "content": prompt}]
)
return response.choices[0].message.contentLoRA Fine-Tuning
# LoRA fine-tuning with PEFT and Transformers
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
import torch
# Load base model
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Llama tokenizers ship without a pad token; reuse EOS so batched
# padding below (padding="max_length") works.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # NOTE(review): the load_in_8bit kwarg is deprecated in recent
    # transformers releases in favor of quantization_config=BitsAndBytesConfig
    # — confirm against the pinned library version.
    load_in_8bit=True,  # Quantization for memory efficiency
)
# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,  # Rank of the low-rank update matrices
    lora_alpha=32,  # Alpha scaling factor applied to the LoRA update
    lora_dropout=0.1,
    # Adapt both attention and MLP projections.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention
        "gate_proj", "up_proj", "down_proj"  # MLP
    ],
    bias="none",  # leave bias terms frozen
)
# Apply LoRA: wraps the base model so only adapter weights are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Trainable: ~0.1% of total parameters
# Prepare dataset
def tokenize_function(examples):
    """Render each chat via the model's chat template, then tokenize.

    Expects a batched mapping with a "messages" column; truncates/pads
    every rendered conversation to 2048 tokens.
    """
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False  # training data already ends with the assistant turn
        )
        for conversation in examples["messages"]
    ]
    return tokenizer(
        rendered,
        truncation=True,
        max_length=2048,
        padding="max_length",
    )
# `dataset` is presumably the output of prepare_training_data above
# (chat-formatted examples) — TODO confirm against the data-prep step.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names  # keep only the tokenizer outputs
)
# Training arguments
training_args = TrainingArguments(
    output_dir="./lora-model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size 4 * 4 = 16 per device
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,
    save_steps=500,
    # NOTE(review): evaluation_strategy was renamed to eval_strategy in newer
    # transformers releases — confirm against the pinned library version.
    evaluation_strategy="steps",
    eval_steps=500,
    bf16=True,  # matches the bfloat16 model load above
    optim="adamw_torch",
    report_to="wandb",  # assumes a configured W&B login — verify
)
# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # Causal-LM collator: mlm=False means labels are the input ids.
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    ),
)
trainer.train()
# Save LoRA weights
model.save_pretrained("./lora-weights")
Evaluation
# Evaluation framework
from typing import List, Dict
import numpy as np
class FineTuneEvaluator:
def __init__(self, model, tokenizer, test_data: List[Dict]):
self.model = model
self.tokenizer = tokenizer
self.test_data = test_data
def generate(self, prompt: str) -> str:
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
outputs = self.model.generate(
**inputs,
max_new_tokens=512,
temperature=0.1,
do_sample=True
)
return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
def evaluate_accuracy(self) -> Dict:
"""For classification tasks."""
correct = 0
total = len(self.test_data)
for item in self.test_data:
response = self.generate(item["input"])
expected = item["output"].strip().lower()
actual = response.strip().lower()
if expected in actual or actual in expected:
correct += 1
return {"accuracy": correct / total}
def evaluate_rouge(self) -> Dict:
"""For generation tasks."""
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
scores = []
for item in self.test_data:
response = self.generate(item["input"])
score = scorer.score(item["output"], response)
scores.append({
'rouge1': score['rouge1'].fmeasure,
'rouge2': score['rouge2'].fmeasure,
'rougeL': score['rougeL'].fmeasure,
})
return {
'rouge1': np.mean([s['rouge1'] for s in scores]),
'rouge2': np.mean([s['rouge2'] for s in scores]),
'rougeL': np.mean([s['rougeL'] for s in scores]),
}
def evaluate_format_compliance(self, expected_format: str) -> Dict:
"""Check if outputs match expected format."""
import re
compliant = 0
for item in self.test_data:
response = self.generate(item["input"])
if expected_format == "json":
try:
json.loads(response)
compliant += 1
except:
pass
elif expected_format == "markdown":
if response.startswith('#') or '**' in response:
compliant += 1
return {"format_compliance": compliant / len(self.test_data)}Best Practices
Fine-Tuning Best Practices
Data Quality:
- Quality > quantity (100 excellent examples > 1000 mediocre)
- Include diverse examples covering edge cases
- Validate data format before training
Training:
- Start with small learning rate and few epochs
- Monitor validation loss to prevent overfitting
- Use LoRA for efficient training
Evaluation:
- Hold out test set never seen during training
- Evaluate on multiple metrics relevant to your task
- A/B test against base model in production
Conclusion
Fine-tuning is powerful but not always necessary. Start with prompting and RAG, then consider fine-tuning for specialized behavior or format compliance. When you do fine-tune, focus on data quality and proper evaluation.
Need help with LLM fine-tuning? Contact Jishu Labs for expert AI consulting and custom model development.
About Sarah Johnson
Sarah Johnson is the CTO at Jishu Labs with deep expertise in AI systems and machine learning infrastructure.