In [1]:
# Direct Preference Optimization
import torch
import sys
import gc
print(sys.version)
print(f"PyTorch Version: {torch.__version__}")
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
print(f"CUDA Version: {torch.version.cuda}")
print(torch.cuda.get_device_name(0))
print(torch.cuda.is_bf16_supported())
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
#torch.backends.cudnn.benchmark = True
import bitsandbytes as bnb
print(bnb.__version__)
3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)] PyTorch Version: 2.5.1+cu121 True 1 CUDA Version: 12.1 NVIDIA GeForce RTX 4080 Laptop GPU True 0.43.1
In [2]:
from datasets import load_dataset
def format_prompt(example):
    """Format the prompt to Qwen's <|im_start|> and <|im_end|> template."""
    system = "<|im_start|>system\n" + example['system'] + "\n<|im_end|>\n"
    user = "<|im_start|>user\n" + example['input'] + "\n<|im_end|>\n"
    assistant_chosen = "<|im_start|>assistant\n" + example['chosen'] + "\n<|im_end|>\n"
    assistant_rejected = "<|im_start|>assistant\n" + example['rejected'] + "\n<|im_end|>\n"
    return {
        "prompt": system + user,
        "chosen": assistant_chosen,
        "rejected": assistant_rejected,
    }
dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")
dpo_dataset = dpo_dataset.filter(
    lambda r:
        r["status"] != "tie" and
        r["chosen_score"] >= 8 and
        not r["in_gsm8k_train"]
)
dpo_dataset = dpo_dataset.train_test_split(test_size=0.1, seed=137)
test_dataset = dpo_dataset["test"]
dpo_dataset = dpo_dataset["train"]
test_dataset = test_dataset.map(format_prompt, remove_columns=test_dataset.column_names)
dpo_dataset = dpo_dataset.map(format_prompt, remove_columns=dpo_dataset.column_names)
print(len(dpo_dataset), len(test_dataset))
print (dpo_dataset.column_names, test_dataset.column_names)
5329 593 ['chosen', 'rejected', 'prompt'] ['chosen', 'rejected', 'prompt']
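Before training it is worth sanity-checking the formatting, for example by printing one pair from the mapped dataset (a quick sketch using the dpo_dataset built above; not part of the recorded run):
# Inspect one formatted example (assumes dpo_dataset from the cell above)
sample = dpo_dataset[0]
print(sample["prompt"][:300])    # system + user turns wrapped in <|im_start|>/<|im_end|>
print(sample["chosen"][:300])    # preferred assistant turn
print(sample["rejected"][:300])  # dispreferred assistant turn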
In [3]:
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoConfig
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
output_dir="outputs/Qwen-0.5B-DPO"
run_name="Qwen-0.5B-DPO-argilla-distilabel"
# load tokeniser
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
print(tokenizer.padding_side)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Qwen models ship with an EOS token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# Qwen models do not define a bos_token by default; reuse the EOS token
tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})
tokenizer.bos_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
#print("Pad token:", tokenizer.pad_token)
#print("Pad token ID:", tokenizer.pad_token_id)
# 8-bit quantization configuration for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,       # Enable 8-bit quantization
    llm_int8_threshold=6.0
)
config = AutoConfig.from_pretrained(model_name)
# Note: Qwen2 exposes `attention_dropout`; the two BERT-style keys below are stored in the
# config (see the config dump later) but are not read by the Qwen2 modelling code.
config.attention_probs_dropout_prob = 0.05  # Dropout in attention layers
config.hidden_dropout_prob = 0.05           # Dropout in feed-forward layers
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    quantization_config=bnb_config,  # Enables 8-bit QLoRA
    device_map="auto",
    trust_remote_code=True           # Required for Qwen models
)
# LoRA Configuration
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
# Prepare LoRA Configuration
peft_config = LoraConfig(
    lora_alpha=32,      # LoRA scaling
    lora_dropout=0.05,  # Dropout for LoRA layers
    r=16,               # rank; a lower value (e.g. 8) can reduce instability in low-bit models
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['q_proj', 'o_proj', 'k_proj', 'v_proj']  # attention projections to adapt
)
# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\huggingface_hub\file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn(
right trainable params: 2,162,688 || all params: 496,195,456 || trainable%: 0.4359
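The 2,162,688 trainable parameters reported above can be reproduced from the LoRA rank and the Qwen2.5-0.5B projection shapes (hidden size 896, 14 query heads and 2 KV heads of dimension 64, 24 layers, matching the config dump printed below); a sketch of the arithmetic:
# Reproduce print_trainable_parameters() from the LoRA setup (dimensions taken from the Qwen2.5-0.5B config)
hidden, head_dim, n_heads, n_kv_heads, n_layers, r = 896, 64, 14, 2, 24, 16
q_proj = r * (hidden + n_heads * head_dim)     # 16 * (896 + 896)
o_proj = r * (n_heads * head_dim + hidden)     # 16 * (896 + 896)
k_proj = r * (hidden + n_kv_heads * head_dim)  # 16 * (896 + 128)
v_proj = r * (hidden + n_kv_heads * head_dim)  # 16 * (896 + 128)
print(n_layers * (q_proj + k_proj + v_proj + o_proj))  # 2,162,688 trainable LoRA parameters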
In [4]:
from trl import DPOConfig, DPOTrainer
output_dir = "./results_dpo9" ######
# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,  # changed for this run
    optim="paged_adamw_32bit",
    learning_rate=8e-6,             # changed for this run
    lr_scheduler_type="cosine",
    max_steps=1001,
    save_steps=100,
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5,  # too high a beta overfits to the preferences; too low a beta may not align the model well with human feedback (see the loss sketch after this cell)
    remove_unused_columns=False,
)
# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    args=training_arguments,
    train_dataset=dpo_dataset,
    #eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
print(dpo_trainer.model.config)
max_steps is given, it will override any value given in num_train_epochs
Qwen2Config { "_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", "architectures": [ "Qwen2ForCausalLM" ], "attention_dropout": 0.0, "attention_probs_dropout_prob": 0.05, "bos_token_id": 151643, "eos_token_id": 151645, "hidden_act": "silu", "hidden_dropout_prob": 0.05, "hidden_size": 896, "initializer_range": 0.02, "intermediate_size": 4864, "max_position_embeddings": 32768, "max_window_layers": 21, "model_type": "qwen2", "num_attention_heads": 14, "num_hidden_layers": 24, "num_key_value_heads": 2, "quantization_config": { "_load_in_4bit": false, "_load_in_8bit": true, "bnb_4bit_compute_dtype": "float32", "bnb_4bit_quant_storage": "uint8", "bnb_4bit_quant_type": "fp4", "bnb_4bit_use_double_quant": false, "llm_int8_enable_fp32_cpu_offload": false, "llm_int8_has_fp16_weight": false, "llm_int8_skip_modules": null, "llm_int8_threshold": 6.0, "load_in_4bit": false, "load_in_8bit": true, "quant_method": "bitsandbytes" }, "rms_norm_eps": 1e-06, "rope_theta": 1000000.0, "sliding_window": 32768, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "transformers_version": "4.41.2", "use_cache": true, "use_sliding_window": false, "vocab_size": 151936 }
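For reference, the beta above scales the implicit reward inside the DPO objective; when no ref_model is passed and the policy is a PEFT model, TRL can derive the reference log-probabilities by running the base model with the adapter disabled. A minimal per-pair version of the loss (a sketch of the standard sigmoid DPO loss, not TRL's implementation):
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.5):
    # Implicit rewards: beta * (policy - reference) log-probability of each full response
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Negative log-sigmoid of the reward margin; a larger beta sharpens the preference signal
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()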
In [5]:
# Training!
dpo_trainer.train()
print (torch.cuda.memory_summary())
!nvidia-smi
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\bitsandbytes\autograd\_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\bitsandbytes\autograd\_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") Could not estimate the number of tokens of the input, floating-point operations will not be computed
[ 991/1001 19:04:20 < 11:34, 0.01 it/s, Epoch 2.23/3]
Step | Training Loss |
---|---|
100 | 0.770900 |
200 | 0.666400 |
300 | 0.683200 |
400 | 0.606000 |
500 | 0.587800 |
600 | 0.583100 |
700 | 0.580600 |
800 | 0.488000 |
900 | 0.556600 |
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[5], line 2
      1 # Training!
----> 2 dpo_trainer.train()
      3 print (torch.cuda.memory_summary())
      4 get_ipython().system('nvidia-smi')

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1883     hf_hub_utils.enable_progress_bars()
   1884 else:
-> 1885     return inner_training_loop(
   1886         args=args,
   1887         resume_from_checkpoint=resume_from_checkpoint,
   1888         trial=trial,
   1889         ignore_keys_for_eval=ignore_keys_for_eval,
   1890     )

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:2216, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2213 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   2215 with self.accelerator.accumulate(model):
-> 2216     tr_loss_step = self.training_step(model, inputs)
   2218 if (
   2219     args.logging_nan_inf_filter
   2220     and not is_torch_xla_available()
   2221     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   2222 ):
   2223     # if loss is nan or inf simply add the average of previous logged losses
   2224     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~\miniconda3\envs\dpo_env\lib\site-packages\transformers\trainer.py:3238, in Trainer.training_step(self, model, inputs)
   3235     return loss_mb.reduce_mean().detach().to(self.args.device)
   3237 with self.compute_loss_context_manager():
-> 3238     loss = self.compute_loss(model, inputs)
   3240 del inputs
   3241 torch.cuda.empty_cache()

File ~\miniconda3\envs\dpo_env\lib\site-packages\trl\trainer\dpo_trainer.py:1257, in DPOTrainer.compute_loss(self, model, inputs, return_outputs)
   1254 compute_loss_context_manager = torch.cuda.amp.autocast if self._peft_has_been_casted_to_bf16 else nullcontext
   1256 with compute_loss_context_manager():
-> 1257     loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
   1259 # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
   1260 loss = loss.to(self.args.device)

File ~\miniconda3\envs\dpo_env\lib\site-packages\trl\trainer\dpo_trainer.py:1231, in DPOTrainer.get_batch_loss_metrics(self, model, batch, train_eval)
   1228     losses = losses * self.args.rpo_alpha - policy_chosen_logps_avg
   1230 prefix = "eval_" if train_eval == "eval" else ""
-> 1231 metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean().cpu()
   1232 metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean().cpu()
   1233 metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean().cpu()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
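The run crashed with a CUDA out-of-memory error around step 990, but save_steps=100 means checkpoints up to checkpoint-900 are already on disk, so one option (a sketch; not what was done below) is to resume rather than restart:
# Resume from the most recent checkpoint-* folder in output_dir instead of restarting from step 0
dpo_trainer.train(resume_from_checkpoint=True)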
In [8]:
import torch
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 256 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 10602 MiB | 17739 MiB | 409911 GiB | 409901 GiB | | from large pool | 10441 MiB | 17576 MiB | 404344 GiB | 404333 GiB | | from small pool | 161 MiB | 232 MiB | 5567 GiB | 5567 GiB | |---------------------------------------------------------------------------| | Active memory | 10602 MiB | 17739 MiB | 409911 GiB | 409901 GiB | | from large pool | 10441 MiB | 17576 MiB | 404344 GiB | 404333 GiB | | from small pool | 161 MiB | 232 MiB | 5567 GiB | 5567 GiB | |---------------------------------------------------------------------------| | Requested memory | 10569 MiB | 17704 MiB | 407517 GiB | 407506 GiB | | from large pool | 10407 MiB | 17541 MiB | 401958 GiB | 401948 GiB | | from small pool | 161 MiB | 232 MiB | 5558 GiB | 5558 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 26884 MiB | 26890 MiB | 34182 GiB | 34155 GiB | | from large pool | 26706 MiB | 26714 MiB | 34170 GiB | 34143 GiB | | from small pool | 178 MiB | 274 MiB | 12 GiB | 11 GiB | |---------------------------------------------------------------------------| | Non-releasable memory | 2549 MiB | 10545 MiB | 418938 GiB | 418936 GiB | | from large pool | 2534 MiB | 10482 MiB | 413314 GiB | 413312 GiB | | from small pool | 14 MiB | 97 MiB | 5624 GiB | 5624 GiB | |---------------------------------------------------------------------------| | Allocations | 2561 | 2610 | 77364 K | 77362 K | | from large pool | 501 | 536 | 27092 K | 27092 K | | from small pool | 2060 | 2311 | 50271 K | 50269 K | |---------------------------------------------------------------------------| | Active allocs | 2561 | 2610 | 77364 K | 77362 K | | from large pool | 501 | 536 | 27092 K | 27092 K | | from small pool | 2060 | 2311 | 50271 K | 50269 K | |---------------------------------------------------------------------------| | GPU reserved segments | 158 | 204 | 22351 | 22193 | | from large pool | 69 | 103 | 16193 | 16124 | | from small pool | 89 | 137 | 6158 | 6069 | |---------------------------------------------------------------------------| | Non-releasable allocs | 576 | 751 | 43511 K | 43511 K | | from large pool | 58 | 139 | 17206 K | 17205 K | | from small pool | 518 | 636 | 26305 K | 26305 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| Mon Feb 17 19:37:03 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+========================+======================| | 0 NVIDIA GeForce RTX 4080 ... WDDM | 00000000:01:00.0 Off | N/A | | N/A 41C P8 2W / 105W | 11898MiB / 12282MiB | 0% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | 0 N/A N/A 7460 C ...\miniconda3\envs\dpo_env\python.exe N/A | +-----------------------------------------------------------------------------------------+
In [2]:
import torch
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 0 B | 0 B | 0 B | 0 B | | from large pool | 0 B | 0 B | 0 B | 0 B | | from small pool | 0 B | 0 B | 0 B | 0 B | |---------------------------------------------------------------------------| | Active memory | 0 B | 0 B | 0 B | 0 B | | from large pool | 0 B | 0 B | 0 B | 0 B | | from small pool | 0 B | 0 B | 0 B | 0 B | |---------------------------------------------------------------------------| | Requested memory | 0 B | 0 B | 0 B | 0 B | | from large pool | 0 B | 0 B | 0 B | 0 B | | from small pool | 0 B | 0 B | 0 B | 0 B | |---------------------------------------------------------------------------| | GPU reserved memory | 0 B | 0 B | 0 B | 0 B | | from large pool | 0 B | 0 B | 0 B | 0 B | | from small pool | 0 B | 0 B | 0 B | 0 B | |---------------------------------------------------------------------------| | Non-releasable memory | 0 B | 0 B | 0 B | 0 B | | from large pool | 0 B | 0 B | 0 B | 0 B | | from small pool | 0 B | 0 B | 0 B | 0 B | |---------------------------------------------------------------------------| | Allocations | 0 | 0 | 0 | 0 | | from large pool | 0 | 0 | 0 | 0 | | from small pool | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Active allocs | 0 | 0 | 0 | 0 | | from large pool | 0 | 0 | 0 | 0 | | from small pool | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | GPU reserved segments | 0 | 0 | 0 | 0 | | from large pool | 0 | 0 | 0 | 0 | | from small pool | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Non-releasable allocs | 0 | 0 | 0 | 0 | | from large pool | 0 | 0 | 0 | 0 | | from small pool | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| Mon Feb 17 19:41:43 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA GeForce RTX 4080 ... 
WDDM | 00000000:01:00.0 Off | N/A | | N/A 41C P3 20W / 102W | 0MiB / 12282MiB | 0% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | No running processes found | +-----------------------------------------------------------------------------------------+
In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer
output_dir = "./results_dpo9"
latest_checkpoint = "./results_dpo9/checkpoint-900"
model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,
    low_cpu_mem_usage=True,
    device_map="auto",
)
# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,
    optim="paged_adamw_32bit",
    learning_rate=8e-6,
    lr_scheduler_type="cosine",
    max_steps=1001,
    save_steps=100,
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5,  # Too high a beta leads to overfitting to the preferences; too low a beta might not align the model well with human feedback
    remove_unused_columns=False,
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
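The checkpoint path above is hard-coded; a small helper (a hypothetical sketch assuming the Trainer's standard checkpoint-<step> directory naming) can pick the newest checkpoint automatically:
import os, re

def latest_checkpoint_dir(output_dir):
    # Return the checkpoint-<step> subdirectory with the highest step number, or None
    ckpts = [d for d in os.listdir(output_dir) if re.fullmatch(r"checkpoint-\d+", d)]
    return os.path.join(output_dir, max(ckpts, key=lambda d: int(d.split("-")[1]))) if ckpts else None

print(latest_checkpoint_dir("./results_dpo9"))  # e.g. ./results_dpo9/checkpoint-900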
In [4]:
from datasets import load_dataset
def format_prompt(example):
    """Format the prompt to Qwen's <|im_start|> and <|im_end|> template."""
    system = "<|im_start|>system\n" + example['system'] + "\n<|im_end|>\n"
    user = "<|im_start|>user\n" + example['input'] + "\n<|im_end|>\n"
    assistant_chosen = "<|im_start|>assistant\n" + example['chosen'] + "\n<|im_end|>\n"
    assistant_rejected = "<|im_start|>assistant\n" + example['rejected'] + "\n<|im_end|>\n"
    return {
        "prompt": system + user,
        "chosen": assistant_chosen,
        "rejected": assistant_rejected,
    }
dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")
dpo_dataset = dpo_dataset.filter(
    lambda r:
        r["status"] != "tie" and
        r["chosen_score"] >= 8 and
        not r["in_gsm8k_train"]
)
dpo_dataset = dpo_dataset.train_test_split(test_size=0.1, seed=137)
test_dataset = dpo_dataset["test"]
dpo_dataset = dpo_dataset["train"]
test_dataset = test_dataset.map(format_prompt, remove_columns=test_dataset.column_names)
dpo_dataset = dpo_dataset.map(format_prompt, remove_columns=dpo_dataset.column_names)
print(len(dpo_dataset), len(test_dataset))
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
output_dir="outputs/Qwen-0.5B-SFT"
run_name="Qwen-0.5B-SFT-argilla-distilabel"
# load tokeniser
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
print(tokenizer.padding_side)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Qwen models ship with an EOS token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<PAD>"})
tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})  # reuse EOS as BOS, as in the training run
tokenizer.bos_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
print("Pad token:", tokenizer.pad_token)
print("Pad token ID:", tokenizer.pad_token_id)
def tokenize_function(examples):
    # Note: the third positional argument of tokenizer.__call__ is `text_target`,
    # which is why the resulting dataset also contains a `labels` column below.
    return tokenizer(
        examples["prompt"],
        examples["chosen"],
        examples["rejected"],
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
#tokenized_dataset = dpo_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
print(len(tokenized_test_dataset))
print(tokenized_test_dataset)
5329 593
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
right Pad token: <|endoftext|> Pad token ID: 151643 593 Dataset({ features: ['chosen', 'rejected', 'prompt', 'input_ids', 'attention_mask', 'labels'], num_rows: 593 })
In [5]:
dpo_trainer = DPOTrainer(
    model=model,  # use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)
def tokenize_function(examples):
    chosen = tokenizer(
        examples["chosen"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    rejected = tokenizer(
        examples["rejected"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    return {
        "chosen_input_ids": chosen["input_ids"].squeeze(),
        "chosen_attention_mask": chosen["attention_mask"].squeeze(),
        "rejected_input_ids": rejected["input_ids"].squeeze(),
        "rejected_attention_mask": rejected["attention_mask"].squeeze()
    }
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())
def add_labels(example):
    # Create labels for chosen and rejected sequences, masking padding tokens (-100)
    chosen_labels = example["chosen_input_ids"].copy()
    rejected_labels = example["rejected_input_ids"].copy()
    # Mask padding tokens with -100 so they are ignored in the loss calculation
    chosen_labels = [-100 if token == tokenizer.pad_token_id else token for token in chosen_labels]
    rejected_labels = [-100 if token == tokenizer.pad_token_id else token for token in rejected_labels]
    example["chosen_labels"] = chosen_labels
    example["rejected_labels"] = rejected_labels
    return example
tokenized_test_dataset = tokenized_test_dataset.map(add_labels, batched=False)
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())
max_steps is given, it will override any value given in num_train_epochs
Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask']) Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask', 'chosen_labels', 'rejected_labels'])
In [6]:
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset
# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
[75/75 09:51]
{'eval_loss': 0.5386355519294739, 'eval_runtime': 601.6035, 'eval_samples_per_second': 0.986, 'eval_steps_per_second': 0.125, 'eval_rewards/chosen': 0.0023878414649516344, 'eval_rewards/rejected': -0.704483151435852, 'eval_rewards/accuracies': 0.7383333444595337, 'eval_rewards/margins': 0.706870973110199, 'eval_logps/rejected': -399.12103271484375, 'eval_logps/chosen': -367.9290771484375, 'eval_logits/rejected': -0.54879230260849, 'eval_logits/chosen': -0.2736295461654663}
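The reward metrics follow directly from the DPO definition: each reward is beta times the policy-minus-reference log-probability of the response, the margin is chosen minus rejected, and the accuracy is the fraction of test pairs where the chosen reward is higher. A quick consistency check on the numbers printed above:
# Consistency check on the reported metrics (values copied from eval_results above)
chosen_reward, rejected_reward = 0.0023878414649516344, -0.704483151435852
print(chosen_reward - rejected_reward)  # ~0.7069, matching eval_rewards/margins
# eval_rewards/accuracies (~0.738): share of pairs where the chosen response gets the larger reward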
In [8]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 2284 MiB | 21225 MiB | 5609 GiB | 5607 GiB | | from large pool | 2254 MiB | 21195 MiB | 5589 GiB | 5587 GiB | | from small pool | 29 MiB | 37 MiB | 19 GiB | 19 GiB | |---------------------------------------------------------------------------| | Active memory | 2284 MiB | 21225 MiB | 5609 GiB | 5607 GiB | | from large pool | 2254 MiB | 21195 MiB | 5589 GiB | 5587 GiB | | from small pool | 29 MiB | 37 MiB | 19 GiB | 19 GiB | |---------------------------------------------------------------------------| | Requested memory | 2284 MiB | 21224 MiB | 5608 GiB | 5606 GiB | | from large pool | 2254 MiB | 21194 MiB | 5588 GiB | 5586 GiB | | from small pool | 29 MiB | 37 MiB | 19 GiB | 19 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 3006 MiB | 24328 MiB | 24330 MiB | 21324 MiB | | from large pool | 2976 MiB | 24290 MiB | 24290 MiB | 21314 MiB | | from small pool | 30 MiB | 38 MiB | 40 MiB | 10 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 739281 KiB | 5456 MiB | 3236 GiB | 3235 GiB | | from large pool | 738804 KiB | 5456 MiB | 3198 GiB | 3197 GiB | | from small pool | 476 KiB | 5 MiB | 38 GiB | 38 GiB | |---------------------------------------------------------------------------| | Allocations | 555 | 774 | 278769 | 278214 | | from large pool | 170 | 182 | 180537 | 180367 | | from small pool | 385 | 597 | 98232 | 97847 | |---------------------------------------------------------------------------| | Active allocs | 555 | 774 | 278769 | 278214 | | from large pool | 170 | 182 | 180537 | 180367 | | from small pool | 385 | 597 | 98232 | 97847 | |---------------------------------------------------------------------------| | GPU reserved segments | 121 | 130 | 131 | 10 | | from large pool | 106 | 111 | 111 | 5 | | from small pool | 15 | 19 | 20 | 5 | |---------------------------------------------------------------------------| | Non-releasable allocs | 110 | 172 | 136235 | 136125 | | from large pool | 107 | 111 | 90490 | 90383 | | from small pool | 3 | 63 | 45745 | 45742 | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| Mon Feb 17 19:53:55 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA GeForce RTX 4080 ... 
WDDM | 00000000:01:00.0 Off | N/A | | N/A 48C P8 7W / 100W | 3210MiB / 12282MiB | 2% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | 0 N/A N/A 11088 C ...\miniconda3\envs\dpo_env\python.exe N/A | +-----------------------------------------------------------------------------------------+
In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer
output_dir = "./results_dpo7"
latest_checkpoint = "./results_dpo7/checkpoint-1000"
model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,
    low_cpu_mem_usage=True,
    device_map="auto",
)
# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    learning_rate=7e-6,
    lr_scheduler_type="cosine",
    max_steps=2001,
    logging_steps=50,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.03,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5,  # Too high a beta leads to overfitting to the preferences; too low a beta might not align the model well with human feedback
    remove_unused_columns=False,
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
In [10]:
print("Available keys in tokenized test dataset:", tokenized_test_dataset[0].keys())
# learning_rate=7e-6 after 1000 steps
dpo_trainer = DPOTrainer(
    model=model,  # use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset
# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
max_steps is given, it will override any value given in num_train_epochs
Available keys in tokenized test dataset: dict_keys(['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask', 'chosen_labels', 'rejected_labels'])
[75/75 12:08]
{'eval_loss': 0.54181969165802, 'eval_runtime': 740.6239, 'eval_samples_per_second': 0.801, 'eval_steps_per_second': 0.101, 'eval_rewards/chosen': -0.2442632019519806, 'eval_rewards/rejected': -1.1008610725402832, 'eval_rewards/accuracies': 0.746666669845581, 'eval_rewards/margins': 0.8565980195999146, 'eval_logps/rejected': -399.9137878417969, 'eval_logps/chosen': -368.42236328125, 'eval_logits/rejected': -0.5519282817840576, 'eval_logits/chosen': -0.2777632772922516}
In [11]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
print (torch.cuda.memory_summary())
!nvidia-smi
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 2306 MiB | 23523 MiB | 11222 GiB | 11220 GiB | | from large pool | 2277 MiB | 23464 MiB | 11182 GiB | 11180 GiB | | from small pool | 29 MiB | 67 MiB | 39 GiB | 39 GiB | |---------------------------------------------------------------------------| | Active memory | 2306 MiB | 23523 MiB | 11222 GiB | 11220 GiB | | from large pool | 2277 MiB | 23464 MiB | 11182 GiB | 11180 GiB | | from small pool | 29 MiB | 67 MiB | 39 GiB | 39 GiB | |---------------------------------------------------------------------------| | Requested memory | 2284 MiB | 23500 MiB | 11217 GiB | 11214 GiB | | from large pool | 2254 MiB | 23440 MiB | 11177 GiB | 11175 GiB | | from small pool | 29 MiB | 67 MiB | 39 GiB | 39 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 2874 MiB | 26716 MiB | 48040 MiB | 45166 MiB | | from large pool | 2838 MiB | 26648 MiB | 47962 MiB | 45124 MiB | | from small pool | 36 MiB | 68 MiB | 78 MiB | 42 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 580753 KiB | 5456 MiB | 6550 GiB | 6549 GiB | | from large pool | 574132 KiB | 5456 MiB | 6490 GiB | 6490 GiB | | from small pool | 6620 KiB | 18 MiB | 59 GiB | 59 GiB | |---------------------------------------------------------------------------| | Allocations | 555 | 1328 | 557537 | 556982 | | from large pool | 170 | 351 | 361073 | 360903 | | from small pool | 385 | 982 | 196464 | 196079 | |---------------------------------------------------------------------------| | Active allocs | 555 | 1328 | 557537 | 556982 | | from large pool | 170 | 351 | 361073 | 360903 | | from small pool | 385 | 982 | 196464 | 196079 | |---------------------------------------------------------------------------| | GPU reserved segments | 114 | 217 | 227 | 113 | | from large pool | 96 | 183 | 188 | 92 | | from small pool | 18 | 34 | 39 | 21 | |---------------------------------------------------------------------------| | Non-releasable allocs | 104 | 223 | 263603 | 263499 | | from large pool | 96 | 157 | 174967 | 174871 | | from small pool | 8 | 68 | 88636 | 88628 | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| Mon Feb 17 20:06:56 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 566.36 Driver Version: 566.36 CUDA Version: 12.7 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA GeForce RTX 4080 ... 
WDDM | 00000000:01:00.0 Off | N/A | | N/A 55C P3 31W / 81W | 3157MiB / 12282MiB | 1% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | 0 N/A N/A 4184 C ...les\LibreOffice\program\soffice.bin N/A | | 0 N/A N/A 11088 C ...\miniconda3\envs\dpo_env\python.exe N/A | +-----------------------------------------------------------------------------------------+
In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from trl import DPOConfig, DPOTrainer
output_dir = "./results_dpo9"
latest_checkpoint = "./results_dpo9/checkpoint-900"
model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,
    low_cpu_mem_usage=True,
    device_map="auto",
)
# Training arguments
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,
    optim="paged_adamw_32bit",
    learning_rate=8e-6,
    lr_scheduler_type="cosine",
    max_steps=1001,
    save_steps=100,
    logging_steps=100,
    #eval_strategy="steps",
    #eval_steps=200,
    bf16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.05,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    max_prompt_length=512,
    max_length=512,
    beta=0.5,  # Too high a beta leads to overfitting to the preferences; too low a beta might not align the model well with human feedback
    remove_unused_columns=False,
)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
In [14]:
# learning_rate=8e-6 after 900 steps BUT: gradient_accumulation_steps=3
dpo_trainer = DPOTrainer(
    model=model,  # use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset
# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
max_steps is given, it will override any value given in num_train_epochs
[75/75 12:10]
{'eval_loss': 0.5386355519294739, 'eval_runtime': 742.3444, 'eval_samples_per_second': 0.799, 'eval_steps_per_second': 0.101, 'eval_rewards/chosen': 0.0023878414649516344, 'eval_rewards/rejected': -0.704483151435852, 'eval_rewards/accuracies': 0.7383333444595337, 'eval_rewards/margins': 0.706870973110199, 'eval_logps/rejected': -399.12103271484375, 'eval_logps/chosen': -367.9290771484375, 'eval_logits/rejected': -0.54879230260849, 'eval_logits/chosen': -0.2736295461654663}
In [15]:
# learning_rate=8e-6 after 800 steps BUT: gradient_accumulation_steps=3
latest_checkpoint = "./results_dpo9/checkpoint-800"
model = AutoPeftModelForCausalLM.from_pretrained(
    latest_checkpoint,
    low_cpu_mem_usage=True,
    device_map="auto",
)
dpo_trainer = DPOTrainer(
    model=model,  # use the PEFT model directly
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
)
# set the eval_dataset to tokenized_test_dataset with correct keys
dpo_trainer.eval_dataset = tokenized_test_dataset
# Evaluate:
eval_results = dpo_trainer.evaluate()
print(eval_results)
print (torch.cuda.memory_summary())
!nvidia-smi
C:\Users\alexa\miniconda3\envs\dpo_env\lib\site-packages\accelerate\utils\modeling.py:1384: UserWarning: Current model requires 402656256 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True. warnings.warn( Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. max_steps is given, it will override any value given in num_train_epochs
[75/75 12:08]
{'eval_loss': 0.5402924418449402, 'eval_runtime': 738.7521, 'eval_samples_per_second': 0.803, 'eval_steps_per_second': 0.102, 'eval_rewards/chosen': -0.04961897060275078, 'eval_rewards/rejected': -0.7467493414878845, 'eval_rewards/accuracies': 0.746666669845581, 'eval_rewards/margins': 0.6971304416656494, 'eval_logps/rejected': -399.20556640625, 'eval_logps/chosen': -368.0331115722656, 'eval_logits/rejected': -0.5474064350128174, 'eval_logits/chosen': -0.2720312774181366} |===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 4612 MiB | 23553 MiB | 22463 GiB | 22458 GiB | | from large pool | 4553 MiB | 23494 MiB | 22384 GiB | 22379 GiB | | from small pool | 59 MiB | 67 MiB | 78 GiB | 78 GiB | |---------------------------------------------------------------------------| | Active memory | 4612 MiB | 23553 MiB | 22463 GiB | 22458 GiB | | from large pool | 4553 MiB | 23494 MiB | 22384 GiB | 22379 GiB | | from small pool | 59 MiB | 67 MiB | 78 GiB | 78 GiB | |---------------------------------------------------------------------------| | Requested memory | 4559 MiB | 23500 MiB | 22433 GiB | 22429 GiB | | from large pool | 4500 MiB | 23440 MiB | 22354 GiB | 22350 GiB | | from small pool | 59 MiB | 67 MiB | 78 GiB | 78 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 26656 MiB | 26716 MiB | 71822 MiB | 45166 MiB | | from large pool | 26588 MiB | 26648 MiB | 71712 MiB | 45124 MiB | | from small pool | 68 MiB | 68 MiB | 110 MiB | 42 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 206114 KiB | 5456 MiB | 13183 GiB | 13183 GiB | | from large pool | 205161 KiB | 5456 MiB | 13079 GiB | 13078 GiB | | from small pool | 953 KiB | 19 MiB | 104 GiB | 104 GiB | |---------------------------------------------------------------------------| | Allocations | 1109 | 1328 | 1115 K | 1113 K | | from large pool | 339 | 351 | 722 K | 721 K | | from small pool | 770 | 982 | 392 K | 392 K | |---------------------------------------------------------------------------| | Active allocs | 1109 | 1328 | 1115 K | 1113 K | | from large pool | 339 | 351 | 722 K | 721 K | | from small pool | 770 | 982 | 392 K | 392 K | |---------------------------------------------------------------------------| | GPU reserved segments | 214 | 217 | 327 | 113 | | from large pool | 180 | 183 | 272 | 92 | | from small pool | 34 | 34 | 55 | 21 | |---------------------------------------------------------------------------| | Non-releasable allocs | 126 | 223 | 525783 | 525657 | | from large pool | 124 | 157 | 351861 | 351737 | | from small pool | 2 | 68 | 173922 | 173920 | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| Mon Feb 17 20:37:53 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 
566.36 Driver Version: 566.36 CUDA Version: 12.7 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA GeForce RTX 4080 ... WDDM | 00000000:01:00.0 Off | N/A | | N/A 56C P0 41W / 85W | 11726MiB / 12282MiB | 100% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | 0 N/A N/A 11088 C ...\miniconda3\envs\dpo_env\python.exe N/A | +-----------------------------------------------------------------------------------------+
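Of the checkpoints evaluated here, ./results_dpo9/checkpoint-900 gives the lowest eval_loss (0.5386, vs 0.5403 for checkpoint-800 and 0.5418 for the ./results_dpo7/checkpoint-1000 run), while the results_dpo7 checkpoint shows the larger reward margin. To try the preferred adapter interactively, a minimal generation sketch (assuming checkpoint-900 and the tokenizer loaded earlier; merge_and_unload folds the LoRA weights back into the base model):
from peft import AutoPeftModelForCausalLM

# Load the adapter, merge the LoRA weights into the base model, and generate one reply
model = AutoPeftModelForCausalLM.from_pretrained("./results_dpo9/checkpoint-900", device_map="auto")
model = model.merge_and_unload()

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain the difference between DPO and RLHF in two sentences."},
]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))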
In [ ]: