In [1]:
# Direct Preference Optimization
import torch
import sys
import gc
print(sys.version)
print(f"PyTorch Version: {torch.__version__}")
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(torch.cuda.get_device_name(0))
    print(torch.cuda.is_bf16_supported())
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
#torch.backends.cudnn.benchmark = True
import bitsandbytes as bnb
print(bnb.__version__)
3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]
PyTorch Version: 2.5.1+cu121
True
1
CUDA Version: 12.1
NVIDIA GeForce RTX 4080 Laptop GPU
True
0.43.1
In [2]:
from datasets import load_dataset
def format_prompt(example):
    """Format the prompt to Qwen's <|im_start|> and <|im_end|> template."""
    system = "<|im_start|>system\n" + example['system'] + "\n<|im_end|>\n"
    user = "<|im_start|>user\n" + example['input'] + "\n<|im_end|>\n"
    assistant_chosen = "<|im_start|>assistant\n" + example['chosen'] + "\n<|im_end|>\n"
    assistant_rejected = "<|im_start|>assistant\n" + example['rejected'] + "\n<|im_end|>\n"
    return {
        "prompt": system + user,
        "chosen": assistant_chosen,
        "rejected": assistant_rejected,
    }
dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")
dpo_dataset = dpo_dataset.filter(
    lambda r:
        r["status"] != "tie" and
        r["chosen_score"] >= 8 and
        not r["in_gsm8k_train"]
)
dpo_dataset = dpo_dataset.train_test_split(test_size=0.1, seed=137)
test_dataset = dpo_dataset["test"]
dpo_dataset = dpo_dataset["train"]
test_dataset = test_dataset.map(format_prompt, remove_columns=test_dataset.column_names)
dpo_dataset = dpo_dataset.map(format_prompt, remove_columns=dpo_dataset.column_names)
print(len(dpo_dataset), len(test_dataset))
print (dpo_dataset.column_names, test_dataset.column_names)
5329 593
['chosen', 'rejected', 'prompt'] ['chosen', 'rejected', 'prompt']
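Before moving on it is worth eyeballing one formatted pair to confirm the chat template looks right. A minimal, optional check (not part of the original run):

# optional sanity check: inspect one formatted preference pair
sample = dpo_dataset[0]
print(sample["prompt"])          # system + user turns in the <|im_start|> template
print(sample["chosen"][:200])    # start of the preferred assistant turn
print(sample["rejected"][:200])  # start of the rejected assistant turn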
In [3]:
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoConfig
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
output_dir="outputs/Qwen-0.5B-DPO"
run_name="Qwen-0.5B-DPO-argilla-distilabel"
# load tokeniser
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
print(tokenizer.padding_side)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Qwen models should have an EOS token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# make sure a BOS token is set; reuse the EOS token
tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})
tokenizer.bos_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
#print("Pad token:", tokenizer.pad_token)
#print("Pad token ID:", tokenizer.pad_token_id)
# 8-bit quantization configuration for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,       # Enable 8-bit quantization
    llm_int8_threshold=6.0
)
config = AutoConfig.from_pretrained(model_name)
# Note: the Qwen2 config exposes `attention_dropout`; the two attributes below are
# extra keys and may not actually switch on dropout inside the attention/FFN layers.
config.attention_probs_dropout_prob = 0.05  # Dropout in attention layers
config.hidden_dropout_prob = 0.05           # Dropout in feed-forward layers
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    quantization_config=bnb_config,  # Enables 8-bit QLoRA
    device_map="auto",
    trust_remote_code=True           # Required for Qwen models
)
# LoRA Configuration
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
# Prepare LoRA Configuration
peft_config = LoraConfig(
    lora_alpha=32,      # LoRA scaling
    lora_dropout=0.05,  # Dropout for LoRA layers
    r=16,               # LoRA rank; a lower rank (e.g. 8) can help stability with low-bit base models
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['q_proj', 'o_proj', 'k_proj', 'v_proj']  # Attention projections to adapt
)
# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\huggingface_hub\file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
right
trainable params: 2,162,688 || all params: 496,195,456 || trainable%: 0.4359
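The reported 2,162,688 trainable parameters can be checked by hand: each LoRA adapter on a d_in x d_out projection adds r*(d_in + d_out) parameters, and with r=16, hidden size 896, head_dim 64 and 2 key/value heads (so k_proj/v_proj map 896 -> 128) over 24 layers the total comes out exactly. A quick back-of-the-envelope check, with the shapes assumed from the Qwen2.5-0.5B config printed in the next cell:

r, d_model, n_layers = 16, 896, 24
d_kv = 2 * 64  # 2 key/value heads x head_dim 64
per_layer = (
    r * (d_model + d_model)    # q_proj: 896 -> 896
    + r * (d_model + d_model)  # o_proj: 896 -> 896
    + r * (d_model + d_kv)     # k_proj: 896 -> 128
    + r * (d_model + d_kv)     # v_proj: 896 -> 128
)
print(per_layer * n_layers)    # 2162688, matching print_trainable_parameters()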
In [4]:
from trl import DPOConfig, DPOTrainer
output_dir = "./results_dpo9" ######
# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,  ##########
    optim="paged_adamw_32bit",
    learning_rate=8e-6,             ########
    lr_scheduler_type="cosine",
    max_steps=1001,
    save_steps=100,
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5,  # beta controls how closely the policy stays to the reference model: too low and it drifts far and overfits the preferences, too high and it barely moves away from the reference
    remove_unused_columns=False,
)
# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    args=training_arguments,
    train_dataset=dpo_dataset,
    #eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
print(dpo_trainer.model.config)
max_steps is given, it will override any value given in num_train_epochs
Qwen2Config {
"_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
"architectures": [
"Qwen2ForCausalLM"
],
"attention_dropout": 0.0,
"attention_probs_dropout_prob": 0.05,
"bos_token_id": 151643,
"eos_token_id": 151645,
"hidden_act": "silu",
"hidden_dropout_prob": 0.05,
"hidden_size": 896,
"initializer_range": 0.02,
"intermediate_size": 4864,
"max_position_embeddings": 32768,
"max_window_layers": 21,
"model_type": "qwen2",
"num_attention_heads": 14,
"num_hidden_layers": 24,
"num_key_value_heads": 2,
"quantization_config": {
"_load_in_4bit": false,
"_load_in_8bit": true,
"bnb_4bit_compute_dtype": "float32",
"bnb_4bit_quant_storage": "uint8",
"bnb_4bit_quant_type": "fp4",
"bnb_4bit_use_double_quant": false,
"llm_int8_enable_fp32_cpu_offload": false,
"llm_int8_has_fp16_weight": false,
"llm_int8_skip_modules": null,
"llm_int8_threshold": 6.0,
"load_in_4bit": false,
"load_in_8bit": true,
"quant_method": "bitsandbytes"
},
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"sliding_window": 32768,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.41.2",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936
}
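For orientation, this is the quantity DPOTrainer minimises. The sketch below is not the trainer's internal code, just the standard sigmoid DPO loss written out so the role of beta is visible: the implicit reward of a response is beta times its policy-vs-reference log-probability ratio, and the loss pushes the chosen reward above the rejected one.

import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.5):
    # per-sequence log-prob ratios of the trained policy vs. the frozen reference
    chosen_logratios = policy_chosen_logps - ref_chosen_logps
    rejected_logratios = policy_rejected_logps - ref_rejected_logps
    # sigmoid DPO loss: prefer chosen over rejected, scaled by beta
    losses = -F.logsigmoid(beta * (chosen_logratios - rejected_logratios))
    # implicit rewards, logged as rewards/chosen and rewards/rejected
    chosen_rewards = beta * chosen_logratios.detach()
    rejected_rewards = beta * rejected_logratios.detach()
    return losses.mean(), chosen_rewards, rejected_rewards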
In [5]:
# Training!
dpo_trainer.train()
print (torch.cuda.memory_summary())
!nvidia-smi
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\bitsandbytes\autograd\_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\bitsandbytes\autograd\_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
Could not estimate the number of tokens of the input, floating-point operations will not be computed
[ 991/1001 19:04:20 < 11:34, 0.01 it/s, Epoch 2.23/3]
| Step | Training Loss |
|---|---|
| 100 | 0.770900 |
| 200 | 0.666400 |
| 300 | 0.683200 |
| 400 | 0.606000 |
| 500 | 0.587800 |
| 600 | 0.583100 |
| 700 | 0.580600 |
| 800 | 0.488000 |
| 900 | 0.556600 |
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[5], line 2
      1 # Training!
----> 2 dpo_trainer.train()
      3 print (torch.cuda.memory_summary())
      4 get_ipython().system('nvidia-smi')

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1883     hf_hub_utils.enable_progress_bars()
   1884 else:
-> 1885     return inner_training_loop(
   1886         args=args,
   1887         resume_from_checkpoint=resume_from_checkpoint,
   1888         trial=trial,
   1889         ignore_keys_for_eval=ignore_keys_for_eval,
   1890     )

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:2216, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2213 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   2215 with self.accelerator.accumulate(model):
-> 2216     tr_loss_step = self.training_step(model, inputs)
   2218 if (
   2219     args.logging_nan_inf_filter
   2220     and not is_torch_xla_available()
   2221     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   2222 ):
   2223     # if loss is nan or inf simply add the average of previous logged losses
   2224     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:3238, in Trainer.training_step(self, model, inputs)
   3235     return loss_mb.reduce_mean().detach().to(self.args.device)
   3237 with self.compute_loss_context_manager():
-> 3238     loss = self.compute_loss(model, inputs)
   3240 del inputs
   3241 torch.cuda.empty_cache()

File ~\miniconda3\envs\dpo_env\lib\site-packages\trl\trainer\dpo_trainer.py:1257, in DPOTrainer.compute_loss(self, model, inputs, return_outputs)
   1254 compute_loss_context_manager = torch.cuda.amp.autocast if self._peft_has_been_casted_to_bf16 else nullcontext
   1256 with compute_loss_context_manager():
-> 1257     loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
   1259 # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
   1260 loss = loss.to(self.args.device)

File ~\miniconda3\envs\dpo_env\lib\site-packages\trl\trainer\dpo_trainer.py:1231, in DPOTrainer.get_batch_loss_metrics(self, model, batch, train_eval)
   1228     losses = losses * self.args.rpo_alpha - policy_chosen_logps_avg
   1230 prefix = "eval_" if train_eval == "eval" else ""
-> 1231 metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean().cpu()
   1232 metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean().cpu()
   1233 metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean().cpu()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
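Training died at roughly step 991 of 1001 with a CUDA out-of-memory error. Because save_steps=100, checkpoints up to step 900 survive in ./results_dpo9, so one option would have been to resume in place rather than start over; a minimal sketch (this notebook instead restarts the kernel and reloads the adapter checkpoint below):

# resume from the newest checkpoint in output_dir (here ./results_dpo9/checkpoint-900)
dpo_trainer.train(resume_from_checkpoint=True)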
In [8]:
import torch
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================|
| PyTorch CUDA memory summary, device ID 0 |
|---------------------------------------------------------------------------|
| CUDA OOMs: 0 | cudaMalloc retries: 256 |
|===========================================================================|
| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |
|---------------------------------------------------------------------------|
| Allocated memory | 10602 MiB | 17739 MiB | 409911 GiB | 409901 GiB |
| from large pool | 10441 MiB | 17576 MiB | 404344 GiB | 404333 GiB |
| from small pool | 161 MiB | 232 MiB | 5567 GiB | 5567 GiB |
|---------------------------------------------------------------------------|
| Active memory | 10602 MiB | 17739 MiB | 409911 GiB | 409901 GiB |
| from large pool | 10441 MiB | 17576 MiB | 404344 GiB | 404333 GiB |
| from small pool | 161 MiB | 232 MiB | 5567 GiB | 5567 GiB |
|---------------------------------------------------------------------------|
| Requested memory | 10569 MiB | 17704 MiB | 407517 GiB | 407506 GiB |
| from large pool | 10407 MiB | 17541 MiB | 401958 GiB | 401948 GiB |
| from small pool | 161 MiB | 232 MiB | 5558 GiB | 5558 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory | 26884 MiB | 26890 MiB | 34182 GiB | 34155 GiB |
| from large pool | 26706 MiB | 26714 MiB | 34170 GiB | 34143 GiB |
| from small pool | 178 MiB | 274 MiB | 12 GiB | 11 GiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 2549 MiB | 10545 MiB | 418938 GiB | 418936 GiB |
| from large pool | 2534 MiB | 10482 MiB | 413314 GiB | 413312 GiB |
| from small pool | 14 MiB | 97 MiB | 5624 GiB | 5624 GiB |
|---------------------------------------------------------------------------|
| Allocations | 2561 | 2610 | 77364 K | 77362 K |
| from large pool | 501 | 536 | 27092 K | 27092 K |
| from small pool | 2060 | 2311 | 50271 K | 50269 K |
|---------------------------------------------------------------------------|
| Active allocs | 2561 | 2610 | 77364 K | 77362 K |
| from large pool | 501 | 536 | 27092 K | 27092 K |
| from small pool | 2060 | 2311 | 50271 K | 50269 K |
|---------------------------------------------------------------------------|
| GPU reserved segments | 158 | 204 | 22351 | 22193 |
| from large pool | 69 | 103 | 16193 | 16124 |
| from small pool | 89 | 137 | 6158 | 6069 |
|---------------------------------------------------------------------------|
| Non-releasable allocs | 576 | 751 | 43511 K | 43511 K |
| from large pool | 58 | 139 | 17206 K | 17205 K |
| from small pool | 518 | 636 | 26305 K | 26305 K |
|---------------------------------------------------------------------------|
| Oversize allocations | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| Oversize GPU segments | 0 | 0 | 0 | 0 |
|===========================================================================|
Mon Feb 17 19:37:03 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4080 ... WDDM | 00000000:01:00.0 Off | N/A |
| N/A 41C P8 2W / 105W | 11898MiB / 12282MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 7460 C ...\miniconda3\envs\dpo_env\python.exe N/A |
+-----------------------------------------------------------------------------------------+
In [2]:
import torch
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================|
| PyTorch CUDA memory summary, device ID 0 |
|---------------------------------------------------------------------------|
| CUDA OOMs: 0 | cudaMalloc retries: 0 |
|===========================================================================|
| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |
|---------------------------------------------------------------------------|
| Allocated memory | 0 B | 0 B | 0 B | 0 B |
| from large pool | 0 B | 0 B | 0 B | 0 B |
| from small pool | 0 B | 0 B | 0 B | 0 B |
|---------------------------------------------------------------------------|
| Active memory | 0 B | 0 B | 0 B | 0 B |
| from large pool | 0 B | 0 B | 0 B | 0 B |
| from small pool | 0 B | 0 B | 0 B | 0 B |
|---------------------------------------------------------------------------|
| Requested memory | 0 B | 0 B | 0 B | 0 B |
| from large pool | 0 B | 0 B | 0 B | 0 B |
| from small pool | 0 B | 0 B | 0 B | 0 B |
|---------------------------------------------------------------------------|
| GPU reserved memory | 0 B | 0 B | 0 B | 0 B |
| from large pool | 0 B | 0 B | 0 B | 0 B |
| from small pool | 0 B | 0 B | 0 B | 0 B |
|---------------------------------------------------------------------------|
| Non-releasable memory | 0 B | 0 B | 0 B | 0 B |
| from large pool | 0 B | 0 B | 0 B | 0 B |
| from small pool | 0 B | 0 B | 0 B | 0 B |
|---------------------------------------------------------------------------|
| Allocations | 0 | 0 | 0 | 0 |
| from large pool | 0 | 0 | 0 | 0 |
| from small pool | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| Active allocs | 0 | 0 | 0 | 0 |
| from large pool | 0 | 0 | 0 | 0 |
| from small pool | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| GPU reserved segments | 0 | 0 | 0 | 0 |
| from large pool | 0 | 0 | 0 | 0 |
| from small pool | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| Non-releasable allocs | 0 | 0 | 0 | 0 |
| from large pool | 0 | 0 | 0 | 0 |
| from small pool | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| Oversize allocations | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| Oversize GPU segments | 0 | 0 | 0 | 0 |
|===========================================================================|
Mon Feb 17 19:41:43 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4080 ... WDDM | 00000000:01:00.0 Off | N/A |
| N/A 41C P3 20W / 102W | 0MiB / 12282MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer
output_dir = "./results_dpo9"
latest_checkpoint = "./results_dpo9/checkpoint-900"
model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,
    low_cpu_mem_usage=True,
    device_map="auto",
)
# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,  ##########
    optim="paged_adamw_32bit",
    learning_rate=8e-6,             ########
    lr_scheduler_type="cosine",
    max_steps=1001,
    save_steps=100,
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5,  # beta controls how closely the policy stays to the reference model: too low and it drifts far and overfits the preferences, too high and it barely moves away from the reference
    remove_unused_columns=False,
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
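AutoPeftModelForCausalLM reloads the Qwen2.5-0.5B base weights and attaches the LoRA adapter saved at step 900. If the adapter were to be shipped for inference, it could also be folded into the base weights; a sketch only, with a hypothetical output path, and not something this notebook does:

merged = model.merge_and_unload()                        # fold the LoRA deltas into the base weights
merged.save_pretrained("outputs/Qwen-0.5B-DPO-merged")   # hypothetical path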
In [4]:
from datasets import load_dataset
def format_prompt(example):
    """Format the prompt to Qwen's <|im_start|> and <|im_end|> template."""
    system = "<|im_start|>system\n" + example['system'] + "\n<|im_end|>\n"
    user = "<|im_start|>user\n" + example['input'] + "\n<|im_end|>\n"
    assistant_chosen = "<|im_start|>assistant\n" + example['chosen'] + "\n<|im_end|>\n"
    assistant_rejected = "<|im_start|>assistant\n" + example['rejected'] + "\n<|im_end|>\n"
    return {
        "prompt": system + user,
        "chosen": assistant_chosen,
        "rejected": assistant_rejected,
    }
dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")
dpo_dataset = dpo_dataset.filter(
    lambda r:
        r["status"] != "tie" and
        r["chosen_score"] >= 8 and
        not r["in_gsm8k_train"]
)
dpo_dataset = dpo_dataset.train_test_split(test_size=0.1, seed=137)
test_dataset = dpo_dataset["test"]
dpo_dataset = dpo_dataset["train"]
test_dataset = test_dataset.map(format_prompt, remove_columns=test_dataset.column_names)
dpo_dataset = dpo_dataset.map(format_prompt, remove_columns=dpo_dataset.column_names)
print(len(dpo_dataset), len(test_dataset))
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
output_dir="outputs/Qwen-0.5B-SFT"
run_name="Qwen-0.5B-SFT-argilla-distilabel"
# load tokeniser
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
print(tokenizer.padding_side)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Qwen models should have an EOS token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<PAD>"})
tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})  #####
tokenizer.bos_token_id = tokenizer.eos_token_id  #######
tokenizer.padding_side = "right"
print("Pad token:", tokenizer.pad_token)
print("Pad token ID:", tokenizer.pad_token_id)

def tokenize_function(examples):
    return tokenizer(
        examples["prompt"],
        examples["chosen"],
        examples["rejected"],
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
#tokenized_dataset = dpo_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
print(len(tokenized_test_dataset))
print(tokenized_test_dataset)
5329 593
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
right
Pad token: <|endoftext|>
Pad token ID: 151643
593
Dataset({
features: ['chosen', 'rejected', 'prompt', 'input_ids', 'attention_mask', 'labels'],
num_rows: 593
})
In [5]:
dpo_trainer = DPOTrainer(
    model=model,  # Use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)

def tokenize_function(examples):
    chosen = tokenizer(
        examples["chosen"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    rejected = tokenizer(
        examples["rejected"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    return {
        "chosen_input_ids": chosen["input_ids"].squeeze(),
        "chosen_attention_mask": chosen["attention_mask"].squeeze(),
        "rejected_input_ids": rejected["input_ids"].squeeze(),
        "rejected_attention_mask": rejected["attention_mask"].squeeze()
    }
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())
def add_labels(example):
    # Create labels for chosen and rejected sequences, masking padding tokens (-100)
    chosen_labels = example["chosen_input_ids"].copy()
    rejected_labels = example["rejected_input_ids"].copy()
    # Mask padding tokens with -100 so they are ignored in loss calculation
    chosen_labels = [-100 if token == tokenizer.pad_token_id else token for token in chosen_labels]
    rejected_labels = [-100 if token == tokenizer.pad_token_id else token for token in rejected_labels]
    example["chosen_labels"] = chosen_labels
    example["rejected_labels"] = rejected_labels
    return example
tokenized_test_dataset = tokenized_test_dataset.map(add_labels, batched=False)
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())
max_steps is given, it will override any value given in num_train_epochs
Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask'])
Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask', 'chosen_labels', 'rejected_labels'])
In [6]:
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset
# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
[75/75 09:51]
{'eval_loss': 0.5386355519294739, 'eval_runtime': 601.6035, 'eval_samples_per_second': 0.986, 'eval_steps_per_second': 0.125, 'eval_rewards/chosen': 0.0023878414649516344, 'eval_rewards/rejected': -0.704483151435852, 'eval_rewards/accuracies': 0.7383333444595337, 'eval_rewards/margins': 0.706870973110199, 'eval_logps/rejected': -399.12103271484375, 'eval_logps/chosen': -367.9290771484375, 'eval_logits/rejected': -0.54879230260849, 'eval_logits/chosen': -0.2736295461654663}
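These numbers are internally consistent: eval_rewards/margins ≈ eval_rewards/chosen − eval_rewards/rejected = 0.0024 − (−0.7045) ≈ 0.7069, and eval_rewards/accuracies ≈ 0.74 means the DPO-tuned adapter assigns the chosen response a higher implicit reward than the rejected one in roughly 74% of the held-out pairs.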
In [8]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================|
| PyTorch CUDA memory summary, device ID 0 |
|---------------------------------------------------------------------------|
| CUDA OOMs: 0 | cudaMalloc retries: 0 |
|===========================================================================|
| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |
|---------------------------------------------------------------------------|
| Allocated memory | 2284 MiB | 21225 MiB | 5609 GiB | 5607 GiB |
| from large pool | 2254 MiB | 21195 MiB | 5589 GiB | 5587 GiB |
| from small pool | 29 MiB | 37 MiB | 19 GiB | 19 GiB |
|---------------------------------------------------------------------------|
| Active memory | 2284 MiB | 21225 MiB | 5609 GiB | 5607 GiB |
| from large pool | 2254 MiB | 21195 MiB | 5589 GiB | 5587 GiB |
| from small pool | 29 MiB | 37 MiB | 19 GiB | 19 GiB |
|---------------------------------------------------------------------------|
| Requested memory | 2284 MiB | 21224 MiB | 5608 GiB | 5606 GiB |
| from large pool | 2254 MiB | 21194 MiB | 5588 GiB | 5586 GiB |
| from small pool | 29 MiB | 37 MiB | 19 GiB | 19 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory | 3006 MiB | 24328 MiB | 24330 MiB | 21324 MiB |
| from large pool | 2976 MiB | 24290 MiB | 24290 MiB | 21314 MiB |
| from small pool | 30 MiB | 38 MiB | 40 MiB | 10 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 739281 KiB | 5456 MiB | 3236 GiB | 3235 GiB |
| from large pool | 738804 KiB | 5456 MiB | 3198 GiB | 3197 GiB |
| from small pool | 476 KiB | 5 MiB | 38 GiB | 38 GiB |
|---------------------------------------------------------------------------|
| Allocations | 555 | 774 | 278769 | 278214 |
| from large pool | 170 | 182 | 180537 | 180367 |
| from small pool | 385 | 597 | 98232 | 97847 |
|---------------------------------------------------------------------------|
| Active allocs | 555 | 774 | 278769 | 278214 |
| from large pool | 170 | 182 | 180537 | 180367 |
| from small pool | 385 | 597 | 98232 | 97847 |
|---------------------------------------------------------------------------|
| GPU reserved segments | 121 | 130 | 131 | 10 |
| from large pool | 106 | 111 | 111 | 5 |
| from small pool | 15 | 19 | 20 | 5 |
|---------------------------------------------------------------------------|
| Non-releasable allocs | 110 | 172 | 136235 | 136125 |
| from large pool | 107 | 111 | 90490 | 90383 |
| from small pool | 3 | 63 | 45745 | 45742 |
|---------------------------------------------------------------------------|
| Oversize allocations | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| Oversize GPU segments | 0 | 0 | 0 | 0 |
|===========================================================================|
Mon Feb 17 19:53:55 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4080 ... WDDM | 00000000:01:00.0 Off | N/A |
| N/A 48C P8 7W / 100W | 3210MiB / 12282MiB | 2% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 11088 C ...\miniconda3\envs\dpo_env\python.exe N/A |
+-----------------------------------------------------------------------------------------+
In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer
output_dir = "./results_dpo7"
latest_checkpoint = "./results_dpo7/checkpoint-1000"
model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,
    low_cpu_mem_usage=True,
    device_map="auto",
)
# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,  ##########
    gradient_accumulation_steps=2,  ##########
    optim="paged_adamw_32bit",
    learning_rate=7e-6,
    lr_scheduler_type="cosine",
    max_steps=2001,
    logging_steps=50,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.03,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5,  # beta controls how closely the policy stays to the reference model: too low and it drifts far and overfits the preferences, too high and it barely moves away from the reference
    remove_unused_columns=False,
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
In [10]:
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())
# learning_rate=7e-6 after 1000 steps
dpo_trainer = DPOTrainer(
    model=model,  # Use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset
# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
max_steps is given, it will override any value given in num_train_epochs
Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask', 'chosen_labels', 'rejected_labels'])
[75/75 12:08]
{'eval_loss': 0.54181969165802, 'eval_runtime': 740.6239, 'eval_samples_per_second': 0.801, 'eval_steps_per_second': 0.101, 'eval_rewards/chosen': -0.2442632019519806, 'eval_rewards/rejected': -1.1008610725402832, 'eval_rewards/accuracies': 0.746666669845581, 'eval_rewards/margins': 0.8565980195999146, 'eval_logps/rejected': -399.9137878417969, 'eval_logps/chosen': -368.42236328125, 'eval_logits/rejected': -0.5519282817840576, 'eval_logits/chosen': -0.2777632772922516}
In [11]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================|
| PyTorch CUDA memory summary, device ID 0 |
|---------------------------------------------------------------------------|
| CUDA OOMs: 0 | cudaMalloc retries: 0 |
|===========================================================================|
| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |
|---------------------------------------------------------------------------|
| Allocated memory | 2306 MiB | 23523 MiB | 11222 GiB | 11220 GiB |
| from large pool | 2277 MiB | 23464 MiB | 11182 GiB | 11180 GiB |
| from small pool | 29 MiB | 67 MiB | 39 GiB | 39 GiB |
|---------------------------------------------------------------------------|
| Active memory | 2306 MiB | 23523 MiB | 11222 GiB | 11220 GiB |
| from large pool | 2277 MiB | 23464 MiB | 11182 GiB | 11180 GiB |
| from small pool | 29 MiB | 67 MiB | 39 GiB | 39 GiB |
|---------------------------------------------------------------------------|
| Requested memory | 2284 MiB | 23500 MiB | 11217 GiB | 11214 GiB |
| from large pool | 2254 MiB | 23440 MiB | 11177 GiB | 11175 GiB |
| from small pool | 29 MiB | 67 MiB | 39 GiB | 39 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory | 2874 MiB | 26716 MiB | 48040 MiB | 45166 MiB |
| from large pool | 2838 MiB | 26648 MiB | 47962 MiB | 45124 MiB |
| from small pool | 36 MiB | 68 MiB | 78 MiB | 42 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 580753 KiB | 5456 MiB | 6550 GiB | 6549 GiB |
| from large pool | 574132 KiB | 5456 MiB | 6490 GiB | 6490 GiB |
| from small pool | 6620 KiB | 18 MiB | 59 GiB | 59 GiB |
|---------------------------------------------------------------------------|
| Allocations | 555 | 1328 | 557537 | 556982 |
| from large pool | 170 | 351 | 361073 | 360903 |
| from small pool | 385 | 982 | 196464 | 196079 |
|---------------------------------------------------------------------------|
| Active allocs | 555 | 1328 | 557537 | 556982 |
| from large pool | 170 | 351 | 361073 | 360903 |
| from small pool | 385 | 982 | 196464 | 196079 |
|---------------------------------------------------------------------------|
| GPU reserved segments | 114 | 217 | 227 | 113 |
| from large pool | 96 | 183 | 188 | 92 |
| from small pool | 18 | 34 | 39 | 21 |
|---------------------------------------------------------------------------|
| Non-releasable allocs | 104 | 223 | 263603 | 263499 |
| from large pool | 96 | 157 | 174967 | 174871 |
| from small pool | 8 | 68 | 88636 | 88628 |
|---------------------------------------------------------------------------|
| Oversize allocations | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| Oversize GPU segments | 0 | 0 | 0 | 0 |
|===========================================================================|
Mon Feb 17 20:06:56 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4080 ... WDDM | 00000000:01:00.0 Off | N/A |
| N/A 55C P3 31W / 81W | 3157MiB / 12282MiB | 1% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 4184 C ...les\LibreOffice\program\soffice.bin N/A |
| 0 N/A N/A 11088 C ...\miniconda3\envs\dpo_env\python.exe N/A |
+-----------------------------------------------------------------------------------------+
In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer
output_dir = "./results_dpo9"
latest_checkpoint = "./results_dpo9/checkpoint-900"
model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,
    low_cpu_mem_usage=True,
    device_map="auto",
)
# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,  ##########
    optim="paged_adamw_32bit",
    learning_rate=8e-6,             ########
    lr_scheduler_type="cosine",
    max_steps=1001,
    save_steps=100,
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5,  # beta controls how closely the policy stays to the reference model: too low and it drifts far and overfits the preferences, too high and it barely moves away from the reference
    remove_unused_columns=False,
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
In [14]:
# learning_rate=8e-6 after 900 steps BUT: gradient_accumulation_steps=3
dpo_trainer = DPOTrainer(
    model=model,  # Use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset
# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
max_steps is given, it will override any value given in num_train_epochs
[75/75 12:10]
{'eval_loss': 0.5386355519294739, 'eval_runtime': 742.3444, 'eval_samples_per_second': 0.799, 'eval_steps_per_second': 0.101, 'eval_rewards/chosen': 0.0023878414649516344, 'eval_rewards/rejected': -0.704483151435852, 'eval_rewards/accuracies': 0.7383333444595337, 'eval_rewards/margins': 0.706870973110199, 'eval_logps/rejected': -399.12103271484375, 'eval_logps/chosen': -367.9290771484375, 'eval_logits/rejected': -0.54879230260849, 'eval_logits/chosen': -0.2736295461654663}
In [15]:
# learning_rate=8e-6 after 800 steps BUT: gradient_accumulation_steps=3
latest_checkpoint = "./results_dpo9/checkpoint-800"
model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,
    low_cpu_mem_usage=True,
    device_map="auto",
)
dpo_trainer = DPOTrainer(
    model=model,  # Use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset
# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
print (torch.cuda.memory_summary())
!nvidia-smi
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\accelerate\utils\modeling.py:1384: UserWarning: Current model requires 402656256 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.
  warnings.warn(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
max_steps is given, it will override any value given in num_train_epochs
[75/75 12:08]
{'eval_loss': 0.5402924418449402, 'eval_runtime': 738.7521, 'eval_samples_per_second': 0.803, 'eval_steps_per_second': 0.102, 'eval_rewards/chosen': -0.04961897060275078, 'eval_rewards/rejected': -0.7467493414878845, 'eval_rewards/accuracies': 0.746666669845581, 'eval_rewards/margins': 0.6971304416656494, 'eval_logps/rejected': -399.20556640625, 'eval_logps/chosen': -368.0331115722656, 'eval_logits/rejected': -0.5474064350128174, 'eval_logits/chosen': -0.2720312774181366}
|===========================================================================|
| PyTorch CUDA memory summary, device ID 0 |
|---------------------------------------------------------------------------|
| CUDA OOMs: 0 | cudaMalloc retries: 0 |
|===========================================================================|
| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |
|---------------------------------------------------------------------------|
| Allocated memory | 4612 MiB | 23553 MiB | 22463 GiB | 22458 GiB |
| from large pool | 4553 MiB | 23494 MiB | 22384 GiB | 22379 GiB |
| from small pool | 59 MiB | 67 MiB | 78 GiB | 78 GiB |
|---------------------------------------------------------------------------|
| Active memory | 4612 MiB | 23553 MiB | 22463 GiB | 22458 GiB |
| from large pool | 4553 MiB | 23494 MiB | 22384 GiB | 22379 GiB |
| from small pool | 59 MiB | 67 MiB | 78 GiB | 78 GiB |
|---------------------------------------------------------------------------|
| Requested memory | 4559 MiB | 23500 MiB | 22433 GiB | 22429 GiB |
| from large pool | 4500 MiB | 23440 MiB | 22354 GiB | 22350 GiB |
| from small pool | 59 MiB | 67 MiB | 78 GiB | 78 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory | 26656 MiB | 26716 MiB | 71822 MiB | 45166 MiB |
| from large pool | 26588 MiB | 26648 MiB | 71712 MiB | 45124 MiB |
| from small pool | 68 MiB | 68 MiB | 110 MiB | 42 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 206114 KiB | 5456 MiB | 13183 GiB | 13183 GiB |
| from large pool | 205161 KiB | 5456 MiB | 13079 GiB | 13078 GiB |
| from small pool | 953 KiB | 19 MiB | 104 GiB | 104 GiB |
|---------------------------------------------------------------------------|
| Allocations | 1109 | 1328 | 1115 K | 1113 K |
| from large pool | 339 | 351 | 722 K | 721 K |
| from small pool | 770 | 982 | 392 K | 392 K |
|---------------------------------------------------------------------------|
| Active allocs | 1109 | 1328 | 1115 K | 1113 K |
| from large pool | 339 | 351 | 722 K | 721 K |
| from small pool | 770 | 982 | 392 K | 392 K |
|---------------------------------------------------------------------------|
| GPU reserved segments | 214 | 217 | 327 | 113 |
| from large pool | 180 | 183 | 272 | 92 |
| from small pool | 34 | 34 | 55 | 21 |
|---------------------------------------------------------------------------|
| Non-releasable allocs | 126 | 223 | 525783 | 525657 |
| from large pool | 124 | 157 | 351861 | 351737 |
| from small pool | 2 | 68 | 173922 | 173920 |
|---------------------------------------------------------------------------|
| Oversize allocations | 0 | 0 | 0 | 0 |
|---------------------------------------------------------------------------|
| Oversize GPU segments | 0 | 0 | 0 | 0 |
|===========================================================================|
Mon Feb 17 20:37:53 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4080 ... WDDM | 00000000:01:00.0 Off | N/A |
| N/A 56C P0 41W / 85W | 11726MiB / 12282MiB | 100% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 11088 C ...\miniconda3\envs\dpo_env\python.exe N/A |
+-----------------------------------------------------------------------------------------+
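Reading the three evaluations together: the results_dpo9 checkpoints at steps 800 and 900 (lr 8e-6, gradient accumulation 3) and the results_dpo7 checkpoint at step 1000 (lr 7e-6, gradient accumulation 2) all land within about 0.003 of each other in eval loss (0.5403, 0.5386, 0.5418). The dpo7 checkpoint shows the largest reward margin (0.857) but also the most negative chosen reward (−0.244), i.e. it has drifted further from the reference model, while checkpoint-900 keeps the chosen reward near zero with a slightly better eval loss.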
In [ ]: