In [1]:
# Direct Preference Optimization
import torch
import sys
import gc
print(sys.version)
print(f"PyTorch Version: {torch.__version__}")
print(torch.cuda.is_available())
print(torch.cuda.device_count())

if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(torch.cuda.get_device_name(0))

print(torch.cuda.is_bf16_supported())

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
#torch.backends.cudnn.benchmark = True

import bitsandbytes as bnb
print(bnb.__version__)
3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]
PyTorch Version: 2.5.1+cu121
True
1
CUDA Version: 12.1
NVIDIA GeForce RTX 4080 Laptop GPU
True
0.43.1
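Since GPU memory becomes the limiting factor later in this notebook, a quick way to check headroom at any point is torch.cuda.mem_get_info (a small sketch, not part of the original run):

import torch
free_b, total_b = torch.cuda.mem_get_info()
print(f"free: {free_b / 2**20:.0f} MiB / total: {total_b / 2**20:.0f} MiB")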
In [2]:
from datasets import load_dataset

def format_prompt(example):
    """Format the prompt to Qwen's <|im_start|> and <|im_end|> template."""
    
    system = "<|im_start|>system\n" + example['system'] + "\n<|im_end|>\n"
    user = "<|im_start|>user\n" + example['input'] + "\n<|im_end|>\n"
    assistant_chosen = "<|im_start|>assistant\n" + example['chosen'] + "\n<|im_end|>\n"
    assistant_rejected = "<|im_start|>assistant\n" + example['rejected'] + "\n<|im_end|>\n"
    
    return {
        "prompt": system + user,
        "chosen": assistant_chosen,
        "rejected": assistant_rejected,
    }


dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")

dpo_dataset = dpo_dataset.filter(
    lambda r:
        r["status"] != "tie" and
        r["chosen_score"] >= 8 and
        not r["in_gsm8k_train"]
)

dpo_dataset = dpo_dataset.train_test_split(test_size=0.1, seed=137)

test_dataset = dpo_dataset["test"]
dpo_dataset = dpo_dataset["train"]

test_dataset = test_dataset.map(format_prompt, remove_columns=test_dataset.column_names)
dpo_dataset = dpo_dataset.map(format_prompt, remove_columns=dpo_dataset.column_names)

print(len(dpo_dataset), len(test_dataset))
print (dpo_dataset.column_names, test_dataset.column_names)
5329 593
['chosen', 'rejected', 'prompt'] ['chosen', 'rejected', 'prompt']
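As a cross-check on the hand-rolled template above, Qwen's tokenizer ships its own chat template, so the <|im_start|>/<|im_end|> framing can be compared against tokenizer.apply_chat_template. A minimal sketch (the messages are made up for illustration; whitespace differs slightly from format_prompt, which inserts a newline before <|im_end|>):

from transformers import AutoTokenizer

_tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", trust_remote_code=True)
_msgs = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is Direct Preference Optimization?"},
]
# Prints the same system/user framing that format_prompt builds by hand
# (without the trailing assistant turn, which lives in "chosen"/"rejected").
print(_tok.apply_chat_template(_msgs, tokenize=False, add_generation_prompt=False))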
In [3]:
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoConfig

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
output_dir="outputs/Qwen-0.5B-DPO"
run_name="Qwen-0.5B-DPO-argilla-distilabel"

# load tokeniser
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
print(tokenizer.padding_side)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Qwen models should have an EOS token
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# Qwen's tokenizer does not set a BOS token by default; reuse EOS as BOS so code that expects one still works:
tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})
tokenizer.bos_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
#print("Pad token:", tokenizer.pad_token)
#print("Pad token ID:", tokenizer.pad_token_id)

# 8-bit quantization configuration for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=6.0
)

config = AutoConfig.from_pretrained(model_name)
# NB: these BERT-style dropout fields end up stored on the config (see the dump below),
# but Qwen2 itself only reads `attention_dropout`, so they likely have no effect here.
config.attention_probs_dropout_prob = 0.05
config.hidden_dropout_prob = 0.05

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,  
    quantization_config=bnb_config,  # Enables 8-bit QLoRA
    device_map="auto", 
    trust_remote_code=True  # Required for Qwen models
)

# LoRA Configuration
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Prepare LoRA Configuration
peft_config = LoraConfig(
    lora_alpha=32,  # LoRA Scaling
    lora_dropout=0.05,  # Dropout for LoRA Layers
    r=16,  # LoRA rank; a lower rank (e.g. 8) can reduce instability in low-bit models
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['q_proj', 'o_proj', 'k_proj', 'v_proj']  # attention projections to adapt
)

# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

model.print_trainable_parameters()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\huggingface_hub\file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
right
trainable params: 2,162,688 || all params: 496,195,456 || trainable%: 0.4359
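The printed count can be sanity-checked by hand. A back-of-envelope sketch using the Qwen2.5-0.5B shapes shown in the config dump in the next cell (hidden size 896, 14 attention heads and 2 KV heads, i.e. head_dim 64, 24 layers); LoRA adds r * (d_in + d_out) parameters per adapted linear layer (A is r x d_in, B is d_out x r):

r = 16
hidden, head_dim, n_kv_heads, n_layers = 896, 64, 2, 24
kv_dim = n_kv_heads * head_dim  # 128
per_layer = (
    r * (hidden + hidden)    # q_proj: 896 -> 896
    + r * (hidden + hidden)  # o_proj: 896 -> 896
    + r * (hidden + kv_dim)  # k_proj: 896 -> 128
    + r * (hidden + kv_dim)  # v_proj: 896 -> 128
)
print(per_layer * n_layers)  # 2162688, matching print_trainable_parameters()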
In [4]:
from trl import DPOConfig, DPOTrainer

output_dir = "./results_dpo9" ######

# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,  # effective batch size of 4 * 3 = 12
    optim="paged_adamw_32bit",
    learning_rate=8e-6,
    lr_scheduler_type="cosine",
    max_steps=1001, 
    save_steps=100, 
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05, 
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5, # Too high a beta leads to overfitting to the preferences; too low a beta might not align the model well with human feedback
    remove_unused_columns=False,    
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    args=training_arguments,
    train_dataset=dpo_dataset,
    #eval_dataset=test_dataset,
    tokenizer=tokenizer,    
)

print(dpo_trainer.model.config)
max_steps is given, it will override any value given in num_train_epochs
Qwen2Config {
  "_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.05,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_dropout_prob": 0.05,
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "quantization_config": {
    "_load_in_4bit": false,
    "_load_in_8bit": true,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": false,
    "load_in_8bit": true,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
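To make the beta comment in the config concrete: DPO minimises -log sigmoid(beta * margin), where the margin is the difference between the policy-vs-reference log-probability gaps of the chosen and rejected completions, and the per-response rewards reported during evaluation are beta times those gaps. A minimal numerical sketch with toy log-probabilities (not TRL's implementation):

import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.5):
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    losses = -F.logsigmoid(chosen_rewards - rejected_rewards)
    return losses.mean(), chosen_rewards, rejected_rewards

# Toy sequence log-probs: the same log-prob margin is amplified by a larger beta,
# which is why too high a beta can overfit to the preference pairs.
pc, pr = torch.tensor([-360.0]), torch.tensor([-400.0])   # policy: chosen, rejected
rc, rr = torch.tensor([-365.0]), torch.tensor([-395.0])   # reference: chosen, rejected
for beta in (0.1, 0.5):
    loss, _, _ = dpo_loss(pc, pr, rc, rr, beta=beta)
    print(f"beta={beta}: loss={loss.item():.4f}")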

In [5]:
# Training!
dpo_trainer.train()
print (torch.cuda.memory_summary())
!nvidia-smi
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\bitsandbytes\autograd\_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\bitsandbytes\autograd\_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
Could not estimate the number of tokens of the input, floating-point operations will not be computed
[ 991/1001 19:04:20 < 11:34, 0.01 it/s, Epoch 2.23/3]
Step Training Loss
100 0.770900
200 0.666400
300 0.683200
400 0.606000
500 0.587800
600 0.583100
700 0.580600
800 0.488000
900 0.556600

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[5], line 2
      1 # Training!
----> 2 dpo_trainer.train()
      3 print (torch.cuda.memory_summary())
      4 get_ipython().system('nvidia-smi')

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1883         hf_hub_utils.enable_progress_bars()
   1884 else:
-> 1885     return inner_training_loop(
   1886         args=args,
   1887         resume_from_checkpoint=resume_from_checkpoint,
   1888         trial=trial,
   1889         ignore_keys_for_eval=ignore_keys_for_eval,
   1890     )

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:2216, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2213     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   2215 with self.accelerator.accumulate(model):
-> 2216     tr_loss_step = self.training_step(model, inputs)
   2218 if (
   2219     args.logging_nan_inf_filter
   2220     and not is_torch_xla_available()
   2221     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   2222 ):
   2223     # if loss is nan or inf simply add the average of previous logged losses
   2224     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:3238, in Trainer.training_step(self, model, inputs)
   3235     return loss_mb.reduce_mean().detach().to(self.args.device)
   3237 with self.compute_loss_context_manager():
-> 3238     loss = self.compute_loss(model, inputs)
   3240 del inputs
   3241 torch.cuda.empty_cache()

File ~\miniconda3\envs\dpo_env\lib\site-packages\trl\trainer\dpo_trainer.py:1257, in DPOTrainer.compute_loss(self, model, inputs, return_outputs)
   1254 compute_loss_context_manager = torch.cuda.amp.autocast if self._peft_has_been_casted_to_bf16 else nullcontext
   1256 with compute_loss_context_manager():
-> 1257     loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
   1259 # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
   1260 loss = loss.to(self.args.device)

File ~\miniconda3\envs\dpo_env\lib\site-packages\trl\trainer\dpo_trainer.py:1231, in DPOTrainer.get_batch_loss_metrics(self, model, batch, train_eval)
   1228     losses = losses * self.args.rpo_alpha - policy_chosen_logps_avg
   1230 prefix = "eval_" if train_eval == "eval" else ""
-> 1231 metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean().cpu()
   1232 metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean().cpu()
   1233 metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean().cpu()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
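The run made it to step ~990 of 1001 (after roughly 19 hours) before the allocator gave up; the memory summary in the next cell shows about 26 GB of reserved memory against a 12 GB card, which under the Windows WDDM driver typically means allocations were spilling into shared system memory. A few hedged mitigations, none verified in this run (smaller_batch_args is just an illustrative dict of the fields that would change): trade per-device batch size for gradient accumulation so the effective batch of 12 stays the same, and set a fragmentation-related allocator option before the first CUDA allocation, i.e. at the top of the notebook.

import os
# Must be set before any CUDA work; mitigates fragmentation in the caching allocator.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128")

# Smaller per-device batch with more accumulation keeps the effective batch at 12.
smaller_batch_args = dict(
    per_device_train_batch_size=2,  # was 4
    gradient_accumulation_steps=6,  # was 3
)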
In [8]:
import torch 
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 256       |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  10602 MiB |  17739 MiB | 409911 GiB | 409901 GiB |
|       from large pool |  10441 MiB |  17576 MiB | 404344 GiB | 404333 GiB |
|       from small pool |    161 MiB |    232 MiB |   5567 GiB |   5567 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  10602 MiB |  17739 MiB | 409911 GiB | 409901 GiB |
|       from large pool |  10441 MiB |  17576 MiB | 404344 GiB | 404333 GiB |
|       from small pool |    161 MiB |    232 MiB |   5567 GiB |   5567 GiB |
|---------------------------------------------------------------------------|
| Requested memory      |  10569 MiB |  17704 MiB | 407517 GiB | 407506 GiB |
|       from large pool |  10407 MiB |  17541 MiB | 401958 GiB | 401948 GiB |
|       from small pool |    161 MiB |    232 MiB |   5558 GiB |   5558 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  26884 MiB |  26890 MiB |  34182 GiB |  34155 GiB |
|       from large pool |  26706 MiB |  26714 MiB |  34170 GiB |  34143 GiB |
|       from small pool |    178 MiB |    274 MiB |     12 GiB |     11 GiB |
|---------------------------------------------------------------------------|
| Non-releasable memory |   2549 MiB |  10545 MiB | 418938 GiB | 418936 GiB |
|       from large pool |   2534 MiB |  10482 MiB | 413314 GiB | 413312 GiB |
|       from small pool |     14 MiB |     97 MiB |   5624 GiB |   5624 GiB |
|---------------------------------------------------------------------------|
| Allocations           |    2561    |    2610    |   77364 K  |   77362 K  |
|       from large pool |     501    |     536    |   27092 K  |   27092 K  |
|       from small pool |    2060    |    2311    |   50271 K  |   50269 K  |
|---------------------------------------------------------------------------|
| Active allocs         |    2561    |    2610    |   77364 K  |   77362 K  |
|       from large pool |     501    |     536    |   27092 K  |   27092 K  |
|       from small pool |    2060    |    2311    |   50271 K  |   50269 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     158    |     204    |   22351    |   22193    |
|       from large pool |      69    |     103    |   16193    |   16124    |
|       from small pool |      89    |     137    |    6158    |    6069    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |     576    |     751    |   43511 K  |   43511 K  |
|       from large pool |      58    |     139    |   17206 K  |   17205 K  |
|       from small pool |     518    |     636    |   26305 K  |   26305 K  |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

Mon Feb 17 19:37:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4080 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   41C    P8              2W /  105W |   11898MiB /  12282MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A      7460      C   ...\miniconda3\envs\dpo_env\python.exe      N/A      |
+-----------------------------------------------------------------------------------------+
In [2]:
# After restarting the kernel (note the cell counter resetting to In [2]), confirm the GPU memory was released
import torch
print(torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Requested memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| GPU reserved memory   |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Non-releasable memory |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Allocations           |       0    |       0    |       0    |       0    |
|       from large pool |       0    |       0    |       0    |       0    |
|       from small pool |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Active allocs         |       0    |       0    |       0    |       0    |
|       from large pool |       0    |       0    |       0    |       0    |
|       from small pool |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| GPU reserved segments |       0    |       0    |       0    |       0    |
|       from large pool |       0    |       0    |       0    |       0    |
|       from small pool |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |       0    |       0    |       0    |       0    |
|       from large pool |       0    |       0    |       0    |       0    |
|       from small pool |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

Mon Feb 17 19:41:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4080 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   41C    P3             20W /  102W |       0MiB /  12282MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer

output_dir = "./results_dpo9"

latest_checkpoint = "./results_dpo9/checkpoint-900"  

model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,  
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,
    optim="paged_adamw_32bit",
    learning_rate=8e-6,
    lr_scheduler_type="cosine",
    max_steps=1001, 
    save_steps=100, 
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05, 
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5, # Too high a beta leads to overfitting to the preferences; too low a beta might not align the model well with human feedback
    remove_unused_columns=False,    
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
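The checkpoint is reloaded here only to evaluate it. If the goal were instead to finish the interrupted run, the Trainer API can also resume directly from a saved checkpoint directory, which restores the optimizer, scheduler and RNG state rather than just the adapter weights. A sketch, assuming a DPOTrainer has been constructed as in the following cells (not executed here):

# Sketch (not run): resume the interrupted training instead of only evaluating.
# dpo_trainer.train(resume_from_checkpoint="./results_dpo9/checkpoint-900")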
In [4]:
from datasets import load_dataset

def format_prompt(example):
    """Format the prompt to Qwen's <|im_start|> and <|im_end|> template."""
    
    system = "<|im_start|>system\n" + example['system'] + "\n<|im_end|>\n"
    user = "<|im_start|>user\n" + example['input'] + "\n<|im_end|>\n"
    assistant_chosen = "<|im_start|>assistant\n" + example['chosen'] + "\n<|im_end|>\n"
    assistant_rejected = "<|im_start|>assistant\n" + example['rejected'] + "\n<|im_end|>\n"
    
    return {
        "prompt": system + user,
        "chosen": assistant_chosen,
        "rejected": assistant_rejected,
    }


dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")

dpo_dataset = dpo_dataset.filter(
    lambda r:
        r["status"] != "tie" and
        r["chosen_score"] >= 8 and
        not r["in_gsm8k_train"]
)

dpo_dataset = dpo_dataset.train_test_split(test_size=0.1, seed=137)

test_dataset = dpo_dataset["test"]
dpo_dataset = dpo_dataset["train"]

test_dataset = test_dataset.map(format_prompt, remove_columns=test_dataset.column_names)
dpo_dataset = dpo_dataset.map(format_prompt, remove_columns=dpo_dataset.column_names)

print(len(dpo_dataset), len(test_dataset))

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
output_dir="outputs/Qwen-0.5B-SFT"
run_name="Qwen-0.5B-SFT-argilla-distilabel"

# load tokeniser
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
print(tokenizer.padding_side)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Qwen models should have an EOS token
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "<PAD>"})
tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})  # reuse EOS as BOS, as above
tokenizer.bos_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
print("Pad token:", tokenizer.pad_token)
print("Pad token ID:", tokenizer.pad_token_id)

def tokenize_function(examples):
    # NB: the third positional argument is interpreted as `text_target`, so the
    # "rejected" text ends up in a `labels` column (see the dataset features printed
    # below); this pass is only a quick check, and the eval set is re-tokenized into
    # the chosen_/rejected_ format expected for evaluation in the next cell.
    return tokenizer(
        examples["prompt"],
        examples["chosen"],
        examples["rejected"],
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

#tokenized_dataset = dpo_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
print(len(tokenized_test_dataset))
print(tokenized_test_dataset)
5329 593
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
right
Pad token: <|endoftext|>
Pad token ID: 151643
593
Dataset({
    features: ['chosen', 'rejected', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 593
})
In [5]:
dpo_trainer = DPOTrainer(
    model=model,  # Use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)

def tokenize_function(examples):
    chosen = tokenizer(
        examples["chosen"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    rejected = tokenizer(
        examples["rejected"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    
    return {
        "chosen_input_ids": chosen["input_ids"].squeeze(),
        "chosen_attention_mask": chosen["attention_mask"].squeeze(),
        "rejected_input_ids": rejected["input_ids"].squeeze(),
        "rejected_attention_mask": rejected["attention_mask"].squeeze()
    }
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())

def add_labels(example):
    # Create labels for chosen and rejected sequences, masking padding tokens (-100)
    chosen_labels = example["chosen_input_ids"].copy()
    rejected_labels = example["rejected_input_ids"].copy()

    # Mask padding tokens with -100 so they are ignored in loss calculation
    chosen_labels = [-100 if token == tokenizer.pad_token_id else token for token in chosen_labels]
    rejected_labels = [-100 if token == tokenizer.pad_token_id else token for token in rejected_labels]

    example["chosen_labels"] = chosen_labels
    example["rejected_labels"] = rejected_labels
    return example

tokenized_test_dataset = tokenized_test_dataset.map(add_labels, batched=False)
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())
max_steps is given, it will override any value given in num_train_epochs
Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask'])
Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask', 'chosen_labels', 'rejected_labels'])
In [6]:
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset

# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
[75/75 09:51]
{'eval_loss': 0.5386355519294739, 'eval_runtime': 601.6035, 'eval_samples_per_second': 0.986, 'eval_steps_per_second': 0.125, 'eval_rewards/chosen': 0.0023878414649516344, 'eval_rewards/rejected': -0.704483151435852, 'eval_rewards/accuracies': 0.7383333444595337, 'eval_rewards/margins': 0.706870973110199, 'eval_logps/rejected': -399.12103271484375, 'eval_logps/chosen': -367.9290771484375, 'eval_logits/rejected': -0.54879230260849, 'eval_logits/chosen': -0.2736295461654663}
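A quick reading of these numbers (a small sketch that only re-derives the margin from the dict above): in DPO the per-response rewards are the beta-scaled log-probability gaps to the reference model, so a chosen reward near zero together with a clearly negative rejected reward suggests the policy mostly pushed the rejected completions down rather than the chosen ones up.

m = eval_results
print("reward margin:", m["eval_rewards/chosen"] - m["eval_rewards/rejected"])  # ~0.707
print("pairs where chosen is preferred:", m["eval_rewards/accuracies"])         # ~0.74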
In [8]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   2284 MiB |  21225 MiB |   5609 GiB |   5607 GiB |
|       from large pool |   2254 MiB |  21195 MiB |   5589 GiB |   5587 GiB |
|       from small pool |     29 MiB |     37 MiB |     19 GiB |     19 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   2284 MiB |  21225 MiB |   5609 GiB |   5607 GiB |
|       from large pool |   2254 MiB |  21195 MiB |   5589 GiB |   5587 GiB |
|       from small pool |     29 MiB |     37 MiB |     19 GiB |     19 GiB |
|---------------------------------------------------------------------------|
| Requested memory      |   2284 MiB |  21224 MiB |   5608 GiB |   5606 GiB |
|       from large pool |   2254 MiB |  21194 MiB |   5588 GiB |   5586 GiB |
|       from small pool |     29 MiB |     37 MiB |     19 GiB |     19 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   3006 MiB |  24328 MiB |  24330 MiB |  21324 MiB |
|       from large pool |   2976 MiB |  24290 MiB |  24290 MiB |  21314 MiB |
|       from small pool |     30 MiB |     38 MiB |     40 MiB |     10 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 739281 KiB |   5456 MiB |   3236 GiB |   3235 GiB |
|       from large pool | 738804 KiB |   5456 MiB |   3198 GiB |   3197 GiB |
|       from small pool |    476 KiB |      5 MiB |     38 GiB |     38 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     555    |     774    |  278769    |  278214    |
|       from large pool |     170    |     182    |  180537    |  180367    |
|       from small pool |     385    |     597    |   98232    |   97847    |
|---------------------------------------------------------------------------|
| Active allocs         |     555    |     774    |  278769    |  278214    |
|       from large pool |     170    |     182    |  180537    |  180367    |
|       from small pool |     385    |     597    |   98232    |   97847    |
|---------------------------------------------------------------------------|
| GPU reserved segments |     121    |     130    |     131    |      10    |
|       from large pool |     106    |     111    |     111    |       5    |
|       from small pool |      15    |      19    |      20    |       5    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |     110    |     172    |  136235    |  136125    |
|       from large pool |     107    |     111    |   90490    |   90383    |
|       from small pool |       3    |      63    |   45745    |   45742    |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

Mon Feb 17 19:53:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4080 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   48C    P8              7W /  100W |    3210MiB /  12282MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A     11088      C   ...\miniconda3\envs\dpo_env\python.exe      N/A      |
+-----------------------------------------------------------------------------------------+
In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer

output_dir = "./results_dpo7"

latest_checkpoint = "./results_dpo7/checkpoint-1000"  

model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,  
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    learning_rate=7e-6,
    lr_scheduler_type="cosine",
    max_steps=2001,
    logging_steps=50,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.03,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5, # Too high a beta leads to overfitting to the preferences; too low a beta might not align the model well with human feedback
    remove_unused_columns=False,    
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
In [10]:
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())
# results_dpo7 run (learning_rate=7e-6, gradient_accumulation_steps=2), evaluated at checkpoint-1000
dpo_trainer = DPOTrainer(
    model=model,  # Use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)

# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset

# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
max_steps is given, it will override any value given in num_train_epochs
Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask', 'chosen_labels', 'rejected_labels'])
[75/75 12:08]
{'eval_loss': 0.54181969165802, 'eval_runtime': 740.6239, 'eval_samples_per_second': 0.801, 'eval_steps_per_second': 0.101, 'eval_rewards/chosen': -0.2442632019519806, 'eval_rewards/rejected': -1.1008610725402832, 'eval_rewards/accuracies': 0.746666669845581, 'eval_rewards/margins': 0.8565980195999146, 'eval_logps/rejected': -399.9137878417969, 'eval_logps/chosen': -368.42236328125, 'eval_logits/rejected': -0.5519282817840576, 'eval_logits/chosen': -0.2777632772922516}
In [11]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   2306 MiB |  23523 MiB |  11222 GiB |  11220 GiB |
|       from large pool |   2277 MiB |  23464 MiB |  11182 GiB |  11180 GiB |
|       from small pool |     29 MiB |     67 MiB |     39 GiB |     39 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   2306 MiB |  23523 MiB |  11222 GiB |  11220 GiB |
|       from large pool |   2277 MiB |  23464 MiB |  11182 GiB |  11180 GiB |
|       from small pool |     29 MiB |     67 MiB |     39 GiB |     39 GiB |
|---------------------------------------------------------------------------|
| Requested memory      |   2284 MiB |  23500 MiB |  11217 GiB |  11214 GiB |
|       from large pool |   2254 MiB |  23440 MiB |  11177 GiB |  11175 GiB |
|       from small pool |     29 MiB |     67 MiB |     39 GiB |     39 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   2874 MiB |  26716 MiB |  48040 MiB |  45166 MiB |
|       from large pool |   2838 MiB |  26648 MiB |  47962 MiB |  45124 MiB |
|       from small pool |     36 MiB |     68 MiB |     78 MiB |     42 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 580753 KiB |   5456 MiB |   6550 GiB |   6549 GiB |
|       from large pool | 574132 KiB |   5456 MiB |   6490 GiB |   6490 GiB |
|       from small pool |   6620 KiB |     18 MiB |     59 GiB |     59 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     555    |    1328    |  557537    |  556982    |
|       from large pool |     170    |     351    |  361073    |  360903    |
|       from small pool |     385    |     982    |  196464    |  196079    |
|---------------------------------------------------------------------------|
| Active allocs         |     555    |    1328    |  557537    |  556982    |
|       from large pool |     170    |     351    |  361073    |  360903    |
|       from small pool |     385    |     982    |  196464    |  196079    |
|---------------------------------------------------------------------------|
| GPU reserved segments |     114    |     217    |     227    |     113    |
|       from large pool |      96    |     183    |     188    |      92    |
|       from small pool |      18    |      34    |      39    |      21    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |     104    |     223    |  263603    |  263499    |
|       from large pool |      96    |     157    |  174967    |  174871    |
|       from small pool |       8    |      68    |   88636    |   88628    |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

Mon Feb 17 20:06:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4080 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   55C    P3             31W /   81W |    3157MiB /  12282MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A      4184      C   ...les\LibreOffice\program\soffice.bin      N/A      |
|    0   N/A  N/A     11088      C   ...\miniconda3\envs\dpo_env\python.exe      N/A      |
+-----------------------------------------------------------------------------------------+
In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer

output_dir = "./results_dpo9"

latest_checkpoint = "./results_dpo9/checkpoint-900"  

model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,  
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,
    optim="paged_adamw_32bit",
    learning_rate=8e-6,
    lr_scheduler_type="cosine",
    max_steps=1001, 
    save_steps=100, 
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05, 
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5, # Too high a beta leads to overfitting to the preferences; too low a beta might not align the model well with human feedback
    remove_unused_columns=False,    
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
In [14]:
# results_dpo9 run (learning_rate=8e-6, gradient_accumulation_steps=3), evaluated at checkpoint-900
dpo_trainer = DPOTrainer(
    model=model,  # Use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)

# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset

# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
max_steps is given, it will override any value given in num_train_epochs
[75/75 12:10]
{'eval_loss': 0.5386355519294739, 'eval_runtime': 742.3444, 'eval_samples_per_second': 0.799, 'eval_steps_per_second': 0.101, 'eval_rewards/chosen': 0.0023878414649516344, 'eval_rewards/rejected': -0.704483151435852, 'eval_rewards/accuracies': 0.7383333444595337, 'eval_rewards/margins': 0.706870973110199, 'eval_logps/rejected': -399.12103271484375, 'eval_logps/chosen': -367.9290771484375, 'eval_logits/rejected': -0.54879230260849, 'eval_logits/chosen': -0.2736295461654663}
In [15]:
# same results_dpo9 run, evaluated at checkpoint-800
latest_checkpoint = "./results_dpo9/checkpoint-800"  

model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,  
    low_cpu_mem_usage=True,
    device_map="auto",
)
dpo_trainer = DPOTrainer(
    model=model,  # Use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)

# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset

# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)

print (torch.cuda.memory_summary())
!nvidia-smi
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\accelerate\utils\modeling.py:1384: UserWarning: Current model requires 402656256 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.
  warnings.warn(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
max_steps is given, it will override any value given in num_train_epochs
[75/75 12:08]
{'eval_loss': 0.5402924418449402, 'eval_runtime': 738.7521, 'eval_samples_per_second': 0.803, 'eval_steps_per_second': 0.102, 'eval_rewards/chosen': -0.04961897060275078, 'eval_rewards/rejected': -0.7467493414878845, 'eval_rewards/accuracies': 0.746666669845581, 'eval_rewards/margins': 0.6971304416656494, 'eval_logps/rejected': -399.20556640625, 'eval_logps/chosen': -368.0331115722656, 'eval_logits/rejected': -0.5474064350128174, 'eval_logits/chosen': -0.2720312774181366}
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   4612 MiB |  23553 MiB |  22463 GiB |  22458 GiB |
|       from large pool |   4553 MiB |  23494 MiB |  22384 GiB |  22379 GiB |
|       from small pool |     59 MiB |     67 MiB |     78 GiB |     78 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   4612 MiB |  23553 MiB |  22463 GiB |  22458 GiB |
|       from large pool |   4553 MiB |  23494 MiB |  22384 GiB |  22379 GiB |
|       from small pool |     59 MiB |     67 MiB |     78 GiB |     78 GiB |
|---------------------------------------------------------------------------|
| Requested memory      |   4559 MiB |  23500 MiB |  22433 GiB |  22429 GiB |
|       from large pool |   4500 MiB |  23440 MiB |  22354 GiB |  22350 GiB |
|       from small pool |     59 MiB |     67 MiB |     78 GiB |     78 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  26656 MiB |  26716 MiB |  71822 MiB |  45166 MiB |
|       from large pool |  26588 MiB |  26648 MiB |  71712 MiB |  45124 MiB |
|       from small pool |     68 MiB |     68 MiB |    110 MiB |     42 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 206114 KiB |   5456 MiB |  13183 GiB |  13183 GiB |
|       from large pool | 205161 KiB |   5456 MiB |  13079 GiB |  13078 GiB |
|       from small pool |    953 KiB |     19 MiB |    104 GiB |    104 GiB |
|---------------------------------------------------------------------------|
| Allocations           |    1109    |    1328    |    1115 K  |    1113 K  |
|       from large pool |     339    |     351    |     722 K  |     721 K  |
|       from small pool |     770    |     982    |     392 K  |     392 K  |
|---------------------------------------------------------------------------|
| Active allocs         |    1109    |    1328    |    1115 K  |    1113 K  |
|       from large pool |     339    |     351    |     722 K  |     721 K  |
|       from small pool |     770    |     982    |     392 K  |     392 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     214    |     217    |     327    |     113    |
|       from large pool |     180    |     183    |     272    |      92    |
|       from small pool |      34    |      34    |      55    |      21    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |     126    |     223    |  525783    |  525657    |
|       from large pool |     124    |     157    |  351861    |  351737    |
|       from small pool |       2    |      68    |  173922    |  173920    |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

Mon Feb 17 20:37:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4080 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   56C    P0             41W /   85W |   11726MiB /  12282MiB |    100%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A     11088      C   ...\miniconda3\envs\dpo_env\python.exe      N/A      |
+-----------------------------------------------------------------------------------------+
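For easier comparison, the three evaluations above can be collected into one small table (values copied from the printed results; a recap, not a new run):

evals = [
    ("results_dpo9/checkpoint-800 (lr 8e-6, accum 3)", 0.5403, 0.697, 0.747),
    ("results_dpo9/checkpoint-900 (lr 8e-6, accum 3)", 0.5386, 0.707, 0.738),
    ("results_dpo7/checkpoint-1000 (lr 7e-6, accum 2)", 0.5418, 0.857, 0.747),
]
print(f"{'checkpoint':<48} {'eval_loss':>9} {'margin':>7} {'acc':>6}")
for name, loss, margin, acc in evals:
    print(f"{name:<48} {loss:>9.4f} {margin:>7.3f} {acc:>6.3f}")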
In [ ]: