In [1]:
import torch
import sys
import gc
print(sys.version)
print(f"PyTorch Version: {torch.__version__}")
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(torch.cuda.get_device_name(0))
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
import bitsandbytes
import peft
import transformers
print(transformers.__version__)
print(f"bitsandbytes version: {bitsandbytes.__version__}")
print(f"peft version: {peft.__version__}")
print(torch.cuda.is_bf16_supported())
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]
PyTorch Version: 2.5.1+cu121
True
1
CUDA Version: 12.1
NVIDIA GeForce RTX 4080 Laptop GPU
4.50.0.dev0
bitsandbytes version: 0.45.3
peft version: 0.15.2.dev0
True
In [2]:
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
imdb_dataset = imdb_dataset.rename_column("label", "labels")
# Split the test set into validation and test sets
test_val_split = imdb_dataset['test'].train_test_split(test_size=0.95, seed=42)
imdb_dataset['validation'] = test_val_split['train']
imdb_dataset['test'] = test_val_split['test']
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
# Determine the number of labels
num_labels = len(set(imdb_dataset["train"]["labels"]))
print(f"Number of labels: {num_labels}")
# Load the tokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Tokenize the whole dataset, truncate to 384 tokens
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=384)

dataset_encoded = imdb_dataset.map(tokenize, batched=True, batch_size=None)
# Load the pretrained model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))
#print(model)
Number of labels: 2
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
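As a quick sanity check (an added sketch, not a cell from the original run), one can inspect a single encoded example to confirm that the tokenizer added the input_ids/attention_mask columns and respected the 384-token cap, and that the 5%/95% validation/test split has the expected sizes.

# Sketch only: assumes dataset_encoded from the cell above.
sample = dataset_encoded["train"][0]
print(sample.keys())                 # expect: text, labels, input_ids, attention_mask
print(len(sample["input_ids"]))      # at most 384 because of truncation
print({split: len(ds) for split, ds in dataset_encoded.items()})  # split sizes after the 5%/95% split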
In [3]:
# Helper functions
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

def count_trainable_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params, 100 * trainable_params / total_params

def freeze_model_layers(model, unfreeze_pre_classifier=False):
    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False
    # Unfreeze LoRA and DoRA-specific params, including lora_norm
    for name, param in model.named_parameters():
        if (
            "lora.A" in name
            or "lora.B" in name
            or "lora_norm" in name
            or name.endswith(".m")      # For DoRA
            or name.endswith(".m_in")   # For DDoRA
            or name.endswith(".m_out")  # For DDoRA
            or "scale" in name
        ):
            param.requires_grad = True
    # Unfreeze classifier layer (always)
    for name, param in model.named_parameters():
        if name.startswith("classifier."):
            param.requires_grad = True
    # Unfreeze pre-classifier (optional)
    if unfreeze_pre_classifier:
        for name, param in model.named_parameters():
            if name.startswith("pre_classifier."):
                param.requires_grad = True
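These helpers can be exercised on their own before any fine-tuning. The following is an illustrative sketch (not part of the original notebook) that feeds compute_metrics a mock prediction object and counts the parameters of the still-unfrozen base model.

# Illustrative sketch only; assumes the cells above have been run.
from types import SimpleNamespace

mock_pred = SimpleNamespace(
    label_ids=np.array([0, 1, 1, 0]),
    predictions=np.array([[2.0, -1.0],    # argmax -> 0 (correct)
                          [0.5,  1.5],    # argmax -> 1 (correct)
                          [1.2,  0.3],    # argmax -> 0 (wrong, label is 1)
                          [3.0, -2.0]]),  # argmax -> 0 (correct)
)
print(compute_metrics(mock_pred))  # {'accuracy': 0.75, 'f1': ~0.73}

total, trainable, pct = count_trainable_parameters(model)
print(f"Base model: {total:,} params, {trainable:,} trainable ({pct:.2f}%)")  # 100% before freezing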
In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.autograd.set_detect_anomaly(True)
class LoRALayer(nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha, dropout_rate=0.0):
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        self.B = nn.Parameter(1e-4 * torch.randn(rank, out_dim) * std_dev)  # not all zeroes!
        self.alpha = alpha
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        dropped = self.dropout(x @ self.A)
        return self.alpha * (dropped @ self.B)

class LinearWithDoubleDoRA(nn.Module):
    def __init__(self, linear, rank, alpha, scaling_factor=1.0):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.m_out = nn.Parameter(torch.randn(1, linear.out_features) * std_dev)
        self.m_in = nn.Parameter(torch.randn(linear.in_features, 1) * std_dev)
        # Orthogonal initialization for m_out
        #self.m_out = nn.Parameter(torch.empty(1, linear.out_features))
        #nn.init.orthogonal_(self.m_out)
        # Orthogonal initialization for m_in
        #self.m_in = nn.Parameter(torch.empty(linear.in_features, 1))
        #nn.init.orthogonal_(self.m_in)
        self.scale_out = nn.Parameter(torch.full((1, linear.out_features), float(scaling_factor)))
        self.scale_in = nn.Parameter(torch.full((linear.in_features, 1), float(scaling_factor)))
        self.last_lora_output_norm = 0.0  # For monitoring

    def forward(self, x):
        scaled_x = x * self.scale_in.T * self.m_in.T
        linear_output = self.linear(x)
        lora_output = self.lora(scaled_x)
        # Light dropout on the LoRA branch: prevents overfitting to small artifacts,
        # spreads useful signal across more of the low-rank space.
        lora_output = F.dropout(lora_output, p=0.02)
        lora_output_norm = lora_output / (lora_output.norm(p=2, dim=1, keepdim=True) + 1e-9)
        self.last_lora_output_norm = lora_output.norm(p=2, dim=-1).mean().item()
        dora_modification = self.scale_out * self.m_out * lora_output_norm
        return linear_output + dora_modification

def inject_ddora_all_attn(model, rank, alpha, scaling_factor=1.0, dropout_rate=0.0, disable_layers=None):
    target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin",
                     "attention.out_lin", "ffn.lin1", "ffn.lin2"]
    #target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin"]
    if disable_layers is None:
        disable_layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and any(layer in name for layer in target_layers):
            # Try to extract the layer index from names like "transformer.layer.4.attention.q_lin"
            parts = name.split('.')
            layer_idx = None
            for i, part in enumerate(parts):
                if part == "layer" and i + 1 < len(parts):
                    try:
                        layer_idx = int(parts[i + 1])
                        break
                    except ValueError:
                        pass
            if layer_idx is not None and layer_idx in disable_layers:
                continue
            parent_name = name.rsplit('.', 1)[0]
            parent_module = model.get_submodule(parent_name)
            original_linear = getattr(parent_module, name.split('.')[-1])
            ddora_layer = LinearWithDoubleDoRA(original_linear, rank, alpha, scaling_factor)
            ddora_layer.lora.dropout = nn.Dropout(dropout_rate)
            setattr(parent_module, name.split('.')[-1], ddora_layer)
    return model
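Before wiring the wrapper into DistilBERT, it helps to check it in isolation. The snippet below is a minimal sketch (not part of the original run; shapes chosen to match DistilBERT's hidden size) that wraps a standalone nn.Linear, confirms the output shape, and lists the extra trainable tensors the wrapper introduces.

# Standalone sanity check of LinearWithDoubleDoRA (illustrative sketch).
torch.manual_seed(0)
base = nn.Linear(768, 768)
wrapped = LinearWithDoubleDoRA(base, rank=16, alpha=128, scaling_factor=2.0)

x = torch.randn(4, 10, 768)                 # (batch, seq_len, hidden)
with torch.no_grad():
    y_base = base(x)
    y_ddora = wrapped(x)
print(y_ddora.shape)                        # torch.Size([4, 10, 768]), same as the frozen linear
print((y_ddora - y_base).abs().mean())      # non-zero: the DDoRA branch contributes from step 0

for n, p in wrapped.named_parameters():
    print(n, tuple(p.shape))                # linear.*, lora.A/B, m_in/m_out, scale_in/scale_out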
In [5]:
def monitor_lora_parameters(model, threshold=1e-7):
    monitor = {
        "A_abs_mean": [],
        "B_abs_mean": [],
        "A_grad_mean": [],
        "B_grad_mean": [],
        "lora_output_norm": [],
        "B_nonzero_count": [],
    }
    hooks = []
    for name, module in model.named_modules():
        if hasattr(module, "lora") and hasattr(module.lora, "A") and hasattr(module.lora, "B"):
            A_param = module.lora.A
            B_param = module.lora.B
            # Gradient hooks (directly on nn.Parameter)
            if A_param.requires_grad:
                hooks.append(A_param.register_hook(
                    lambda grad, n=name: monitor["A_grad_mean"].append((n, grad.abs().mean().item()))))
            if B_param.requires_grad:
                hooks.append(B_param.register_hook(
                    lambda grad, n=name: monitor["B_grad_mean"].append((n, grad.abs().mean().item()))))
            # Forward hook for value stats
            def forward_hook(mod, inp, out, n=name):
                A_mean = mod.lora.A.abs().mean().item()
                B_mean = mod.lora.B.abs().mean().item()
                B_nnz = (mod.lora.B.abs() > threshold).sum().item()
                monitor["A_abs_mean"].append((n, A_mean))
                monitor["B_abs_mean"].append((n, B_mean))
                monitor["B_nonzero_count"].append((n, B_nnz))
                monitor["lora_output_norm"].append((n, mod.last_lora_output_norm))
            hooks.append(module.register_forward_hook(forward_hook))
    return hooks, monitor

from transformers import TrainingArguments

def monitor_gradients(model):
    hooks = []
    gradient_history = {}
    for name, param in model.named_parameters():
        if param.requires_grad:
            gradient_history[name] = []
            def get_hook(n):  # capture the name immediately
                def hook(grad):
                    gradient_history[n].append(grad.abs().mean().item())
                return hook
            hooks.append(param.register_hook(get_hook(name)))
    return hooks, gradient_history
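Both monitors are consumed later during training. As an illustration (a small sketch, not in the original notebook), monitor_lora_parameters can be attached to a single wrapped layer and populated with one forward/backward pass.

# Sketch: attach the monitor to one DDoRA-wrapped layer and trigger its hooks once.
toy = LinearWithDoubleDoRA(nn.Linear(32, 32), rank=4, alpha=8)
hooks, monitor = monitor_lora_parameters(toy)

out = toy(torch.randn(2, 5, 32))
out.sum().backward()                               # fires the gradient hooks on lora.A / lora.B

print([(k, len(v)) for k, v in monitor.items()])   # each list now holds one (name, value) entry
for h in hooks:
    h.remove()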
In [6]:
learning_rate = 1e-2 #############
dropout = 0.3 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
output_dir_prefix = "finetuned-imdb-"
import copy
torch.manual_seed(137)
model_ddora_all_attn = copy.deepcopy(model)
model_ddora_all_attn = inject_ddora_all_attn(model_ddora_all_attn, lora_rank, lora_alpha, scaling_factor, dropout)
freeze_model_layers(model_ddora_all_attn, unfreeze_pre_classifier=True)
total_params_ddora, trainable_params_ddora, percentage_ddora = count_trainable_parameters(model_ddora_all_attn)
print(f"\nDDoRA (All Attention) - Total parameters: {total_params_ddora:,}")
print(f"DDoRA (All Attention) - Trainable parameters: {trainable_params_ddora:,} ({percentage_ddora:.2f}%)")
# Sanity check
#print("\nTrainable parameters after freezing:")
#for name, param in model_ddora_all_attn.named_parameters():
# if param.requires_grad:
# print(name)
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=2,
    #max_steps=100,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,  #####
    report_to="none",
    log_level="error"
)
trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs) / len(vs) for name, vs in grouped.items()}
# Example output
for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
DDoRA (All Attention) - Total parameters: 68,448,002
DDoRA (All Attention) - Trainable parameters: 2,085,122 (3.05%)
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[1564/1564 2:50:22, Epoch 2/2]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.561800 | 0.287717 | 0.876000 | 0.876094 |
100 | 0.323400 | 0.282308 | 0.882400 | 0.882489 |
150 | 0.346500 | 0.297467 | 0.866400 | 0.866676 |
200 | 0.270500 | 0.269234 | 0.892800 | 0.892974 |
250 | 0.291600 | 0.245029 | 0.899200 | 0.899337 |
300 | 0.277300 | 0.250809 | 0.900800 | 0.899979 |
350 | 0.270500 | 0.230730 | 0.908800 | 0.908724 |
400 | 0.238900 | 0.254672 | 0.898400 | 0.898477 |
450 | 0.252800 | 0.285514 | 0.893600 | 0.892665 |
500 | 0.274800 | 0.250106 | 0.900800 | 0.900514 |
550 | 0.255900 | 0.227513 | 0.911200 | 0.911193 |
600 | 0.266800 | 0.233832 | 0.908000 | 0.907561 |
650 | 0.234700 | 0.232209 | 0.913600 | 0.913390 |
700 | 0.226500 | 0.221549 | 0.914400 | 0.914380 |
750 | 0.242300 | 0.276095 | 0.901600 | 0.900867 |
800 | 0.232700 | 0.254431 | 0.902400 | 0.901749 |
850 | 0.211900 | 0.214088 | 0.918400 | 0.918435 |
900 | 0.194700 | 0.234711 | 0.920000 | 0.920035 |
950 | 0.214000 | 0.216997 | 0.921600 | 0.921600 |
1000 | 0.193900 | 0.204417 | 0.922400 | 0.922382 |
1050 | 0.197700 | 0.208215 | 0.920000 | 0.919934 |
1100 | 0.204300 | 0.211439 | 0.918400 | 0.918287 |
1150 | 0.178300 | 0.199018 | 0.925600 | 0.925504 |
1200 | 0.191200 | 0.211427 | 0.921600 | 0.921655 |
1250 | 0.190500 | 0.201096 | 0.924800 | 0.924751 |
1300 | 0.199400 | 0.199531 | 0.924000 | 0.923930 |
1350 | 0.199300 | 0.202566 | 0.920800 | 0.920860 |
1400 | 0.190200 | 0.193426 | 0.925600 | 0.925583 |
1450 | 0.180300 | 0.203777 | 0.924000 | 0.923970 |
1500 | 0.188700 | 0.196448 | 0.924000 | 0.923944 |
1550 | 0.182900 | 0.196552 | 0.926400 | 0.926365 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 564070 KiB | 12777 MiB | 131243 GiB | 131243 GiB | | from large pool | 546048 KiB | 12712 MiB | 130755 GiB | 130755 GiB | | from small pool | 18022 KiB | 67 MiB | 488 GiB | 487 GiB | |---------------------------------------------------------------------------| | Active memory | 564070 KiB | 12777 MiB | 131243 GiB | 131243 GiB | | from large pool | 546048 KiB | 12712 MiB | 130755 GiB | 130755 GiB | | from small pool | 18022 KiB | 67 MiB | 488 GiB | 487 GiB | |---------------------------------------------------------------------------| | Requested memory | 561856 KiB | 12774 MiB | 131121 GiB | 131120 GiB | | from large pool | 543836 KiB | 12708 MiB | 130635 GiB | 130635 GiB | | from small pool | 18020 KiB | 67 MiB | 485 GiB | 485 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 13058 MiB | 13058 MiB | 13058 MiB | 0 B | | from large pool | 12988 MiB | 12988 MiB | 12988 MiB | 0 B | | from small pool | 70 MiB | 70 MiB | 70 MiB | 0 B | |---------------------------------------------------------------------------| | Non-releasable memory | 183450 KiB | 337456 KiB | 21236 GiB | 21236 GiB | | from large pool | 174848 KiB | 305536 KiB | 20712 GiB | 20711 GiB | | from small pool | 8602 KiB | 43066 KiB | 524 GiB | 524 GiB | |---------------------------------------------------------------------------| | Allocations | 868 | 1343 | 12097 K | 12097 K | | from large pool | 82 | 334 | 3380 K | 3380 K | | from small pool | 786 | 1235 | 8716 K | 8716 K | |---------------------------------------------------------------------------| | Active allocs | 868 | 1343 | 12097 K | 12097 K | | from large pool | 82 | 334 | 3380 K | 3380 K | | from small pool | 786 | 1235 | 8716 K | 8716 K | |---------------------------------------------------------------------------| | GPU reserved segments | 266 | 266 | 266 | 0 | | from large pool | 231 | 231 | 231 | 0 | | from small pool | 35 | 35 | 35 | 0 | |---------------------------------------------------------------------------| | Non-releasable allocs | 42 | 91 | 5276 K | 5276 K | | from large pool | 19 | 25 | 651 K | 651 K | | from small pool | 23 | 73 | 4625 K | 4625 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2542, |B|=0.1388, |∇A|=9.662e-06, |∇B|=1.556e-05, |LoRA(x)|=1.942e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.2517, |B|=0.1378, |∇A|=5.849e-06, |∇B|=1.398e-05, |LoRA(x)|=2.249e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2447, |B|=0.1159, |∇A|=9.926e-06, |∇B|=2.328e-05, |LoRA(x)|=2.345e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2337, |B|=0.1103, |∇A|=1.199e-05, |∇B|=4.393e-05, |LoRA(x)|=1.772e+04, B≠0=12288 
distilbert.transformer.layer.0.ffn.lin1: |A|=0.2507, |B|=0.1355, |∇A|=2.186e-05, |∇B|=1.721e-05, |LoRA(x)|=4.824e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2355, |B|=0.1065, |∇A|=4.293e-06, |∇B|=3.982e-05, |LoRA(x)|=3.453e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2483, |B|=0.1264, |∇A|=1.103e-05, |∇B|=1.93e-05, |LoRA(x)|=1.838e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.24, |B|=0.1291, |∇A|=7.642e-06, |∇B|=1.784e-05, |LoRA(x)|=1.816e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2358, |B|=0.1047, |∇A|=1.237e-05, |∇B|=3.915e-05, |LoRA(x)|=2.235e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.235, |B|=0.1099, |∇A|=1.777e-05, |∇B|=7.302e-05, |LoRA(x)|=1.511e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.2506, |B|=0.1333, |∇A|=1.943e-05, |∇B|=2.347e-05, |LoRA(x)|=5.255e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.2404, |B|=0.1027, |∇A|=7.746e-06, |∇B|=7.263e-05, |LoRA(x)|=2.11e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2486, |B|=0.1312, |∇A|=9.941e-06, |∇B|=2.366e-05, |LoRA(x)|=2.512e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2511, |B|=0.1393, |∇A|=1.015e-05, |∇B|=2.085e-05, |LoRA(x)|=2.348e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.229, |B|=0.09331, |∇A|=8.467e-06, |∇B|=6.062e-05, |LoRA(x)|=3.14e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2312, |B|=0.1061, |∇A|=2.083e-05, |∇B|=6.639e-05, |LoRA(x)|=1.184e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.2536, |B|=0.136, |∇A|=1.437e-05, |∇B|=2.108e-05, |LoRA(x)|=8.651e+04, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2401, |B|=0.1034, |∇A|=9.229e-06, |∇B|=7.97e-05, |LoRA(x)|=2.362e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.243, |B|=0.1357, |∇A|=1.075e-05, |∇B|=2.669e-05, |LoRA(x)|=2.082e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.2531, |B|=0.1389, |∇A|=1.803e-05, |∇B|=2.703e-05, |LoRA(x)|=2.401e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2323, |B|=0.09535, |∇A|=1.354e-05, |∇B|=4.779e-05, |LoRA(x)|=2.079e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2429, |B|=0.1091, |∇A|=3.816e-05, |∇B|=7.215e-05, |LoRA(x)|=1.148e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2364, |B|=0.1223, |∇A|=3.04e-05, |∇B|=3.575e-05, |LoRA(x)|=4.14e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2375, |B|=0.09783, |∇A|=7.788e-06, |∇B|=7.861e-05, |LoRA(x)|=3.256e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.2394, |B|=0.1417, |∇A|=1.331e-05, |∇B|=3.725e-05, |LoRA(x)|=2.13e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.237, |B|=0.1337, |∇A|=2.988e-05, |∇B|=2.972e-05, |LoRA(x)|=1.799e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2298, |B|=0.0931, |∇A|=9.967e-06, |∇B|=3.076e-05, |LoRA(x)|=2.543e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.243, |B|=0.1039, |∇A|=3.259e-05, |∇B|=7.151e-05, |LoRA(x)|=1.27e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2386, |B|=0.1142, |∇A|=2.274e-05, |∇B|=2.123e-05, |LoRA(x)|=3.925e+04, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2255, |B|=0.0829, |∇A|=1.652e-06, |∇B|=6.154e-05, |LoRA(x)|=5.862e+04, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.2407, |B|=0.1339, |∇A|=9.285e-06, |∇B|=3.57e-05, 
|LoRA(x)|=2.189e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.2407, |B|=0.1156, |∇A|=2.446e-05, |∇B|=1.713e-05, |LoRA(x)|=2.256e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.2312, |B|=0.08218, |∇A|=3.595e-06, |∇B|=2.136e-05, |LoRA(x)|=3.667e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.2327, |B|=0.08524, |∇A|=7.693e-06, |∇B|=7.511e-05, |LoRA(x)|=1.842e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.2192, |B|=0.09016, |∇A|=3.35e-06, |∇B|=1.04e-05, |LoRA(x)|=7.823e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2231, |B|=0.08951, |∇A|=1.167e-06, |∇B|=5.976e-05, |LoRA(x)|=1.17e+05, B≠0=12288
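A natural follow-up (sketched here, not part of the original run; the filename is arbitrary) is to persist only the trainable DDoRA and classifier tensors rather than the full model, which keeps the checkpoint at roughly the 2.1M trainable parameters reported above.

# Sketch: save only the trainable adapter/head parameters.
ddora_state = {
    n: p.detach().cpu()
    for n, p in trainer_ddora_all_attn.model.named_parameters()
    if p.requires_grad
}
torch.save(ddora_state, "ddora_all_attn_adapters.pt")   # placeholder filename
print(f"Saved {len(ddora_state)} tensors, "
      f"{sum(t.numel() for t in ddora_state.values()):,} parameters")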
In [7]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")

print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")

print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.2709178924560547 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.16283518075942993 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.26631587743759155 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.16063791513442993 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.25868088006973267 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.13682228326797485 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.24423454701900482 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.13000991940498352 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.26810798048973083 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.16242587566375732 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.24940359592437744 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.12858235836029053 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.26269277930259705 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.14817850291728973 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2528449296951294 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.15097709000110626 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.24744336307048798 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12217148393392563 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2444974035024643 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.12724661827087402 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.26778024435043335 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.1590876430273056 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.2578299641609192 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.12523692846298218 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2628753185272217 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15486016869544983 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.2660772204399109 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.16277363896369934 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23903438448905945 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.10702144354581833 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24155795574188232 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.12425901740789413 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.272270530462265 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.16253915429115295 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.25463196635246277 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12361067533493042 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2559710741043091 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.15975016355514526 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2658747434616089 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.16017454862594604 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2416546791791916 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.10881607234477997 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.25516700744628906 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.12554976344108582 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24647924304008484 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.1423388123512268 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.24841666221618652 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11604660749435425 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2493968904018402 
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.16505199670791626 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2477170079946518 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.15541145205497742 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.23946581780910492 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.10624637454748154 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.25482916831970215 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.11996905505657196 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2472330778837204 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13146373629570007 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23184946179389954 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.09329196065664291 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.24968469142913818 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1556234210729599 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.24908798933029175 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1316540241241455 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.23918092250823975 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.0918576717376709 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24216410517692566 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.09585311263799667 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.22366289794445038 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.10154710710048676 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.23012006282806396 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.1018262729048729 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 38.1004 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 22.9228 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 37.2110 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 22.5209 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 36.6727 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 19.3789 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 34.5602 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 18.3900 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 37.8660 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 45.8121 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 71.1450 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 18.3645 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 37.1972 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 21.0926 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 35.7007 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 21.3030 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 34.8564 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 17.4806 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 34.3520 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 17.8980 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 37.8233 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 45.0524 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 73.1258 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 17.8335 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 37.0488 
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 21.8925 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 37.6010 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 22.8032 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 33.6308 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 14.9832 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 34.1222 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 17.6791 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 38.5886 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 46.2495 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 72.3670 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 17.5751 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 36.0011 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 22.3511 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 37.4962 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 22.5260 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 34.2557 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 15.5532 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 36.0431 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 17.7801 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 34.6972 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 40.1488 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 70.7553 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 16.5732 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 35.1761 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 23.0655 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 34.8911 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 21.9906 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 33.9133 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 14.9897 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 36.2084 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 17.1343 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 34.8437 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 37.3903 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 65.2566 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 13.3532 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 35.0948 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 21.9627 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 35.2049 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 18.9265 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.0101 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 12.8739 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.0780 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 13.5251 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 31.3660 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 29.3310 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 64.2580 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 14.8548 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.0424585342407227 
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9769548177719116 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.04038405418396 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9770689010620117 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9540395736694336 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9575599431991577 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.8957542181015015 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.951843023300171 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.035085678100586 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9643043279647827 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.963477373123169 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9799795150756836 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.932100534439087 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9620461463928223 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.915934443473816 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9438493251800537 distilbert.transformer.layer.2.attention.q_lin.scale_out 1.995427131652832 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9748806953430176 distilbert.transformer.layer.2.attention.k_lin.scale_out 1.9886808395385742 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9784845113754272 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.914613127708435 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9549009799957275 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.895020842552185 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9341068267822266 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0155630111694336 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.969996452331543 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.062822103500366 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9458072185516357 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.926612138748169 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.916110873222351 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9155391454696655 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9431451559066772 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.042447566986084 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9267181158065796 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.0660717487335205 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9439681768417358 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.9255082607269287 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9371190071105957 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9535255432128906 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.896983027458191 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.9638111591339111 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9459632635116577 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.0670223236083984 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9378600120544434 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.923959732055664 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9297794103622437 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.9090628623962402 distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9722636938095093 
Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 57.2453 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.2102 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.1743 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 55.2250 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.6507 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.7082 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 52.9878 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 54.4522 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.9161 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 54.8902 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 54.9656 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.2165 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.9133 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 54.7231 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.4724 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.2155 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 55.7693 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 55.1198 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 55.6959 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.2612 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.3445 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 54.4966 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 52.9000 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.9850 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.3962 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.9642 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 57.6981 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.3925 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 53.7325 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.4871 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.4757 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 54.2742 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.2218 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.8044 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.8294 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.1943 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 53.7764 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.0333 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.6196 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 53.1163 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.1260 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 54.3373 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.7072 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 54.0476 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.9670 
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.9039 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.5177 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.9539 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.3406547009944916 distilbert.transformer.layer.0.attention.q_lin.m_in 0.27271783351898193 distilbert.transformer.layer.0.attention.k_lin.m_out 0.3363686501979828 distilbert.transformer.layer.0.attention.k_lin.m_in 0.26796919107437134 distilbert.transformer.layer.0.attention.v_lin.m_out 0.2689072787761688 distilbert.transformer.layer.0.attention.v_lin.m_in 0.2510956823825836 distilbert.transformer.layer.0.attention.out_lin.m_out 0.22179457545280457 distilbert.transformer.layer.0.attention.out_lin.m_in 0.23105959594249725 distilbert.transformer.layer.1.attention.q_lin.m_out 0.3155955672264099 distilbert.transformer.layer.1.attention.q_lin.m_in 0.2527567744255066 distilbert.transformer.layer.1.attention.k_lin.m_out 0.282562255859375 distilbert.transformer.layer.1.attention.k_lin.m_in 0.25409388542175293 distilbert.transformer.layer.1.attention.v_lin.m_out 0.23175911605358124 distilbert.transformer.layer.1.attention.v_lin.m_in 0.24593064188957214 distilbert.transformer.layer.1.attention.out_lin.m_out 0.2380572408437729 distilbert.transformer.layer.1.attention.out_lin.m_in 0.23218491673469543 distilbert.transformer.layer.2.attention.q_lin.m_out 0.2917119860649109 distilbert.transformer.layer.2.attention.q_lin.m_in 0.2570725679397583 distilbert.transformer.layer.2.attention.k_lin.m_out 0.3022836744785309 distilbert.transformer.layer.2.attention.k_lin.m_in 0.2744084596633911 distilbert.transformer.layer.2.attention.v_lin.m_out 0.2186194658279419 distilbert.transformer.layer.2.attention.v_lin.m_in 0.2405482530593872 distilbert.transformer.layer.2.attention.out_lin.m_out 0.22129730880260468 distilbert.transformer.layer.2.attention.out_lin.m_in 0.22127607464790344 distilbert.transformer.layer.3.attention.q_lin.m_out 0.31316566467285156 distilbert.transformer.layer.3.attention.q_lin.m_in 0.25650596618652344 distilbert.transformer.layer.3.attention.k_lin.m_out 0.3420139253139496 distilbert.transformer.layer.3.attention.k_lin.m_in 0.2549283504486084 distilbert.transformer.layer.3.attention.v_lin.m_out 0.2223138064146042 distilbert.transformer.layer.3.attention.v_lin.m_in 0.21610304713249207 distilbert.transformer.layer.3.attention.out_lin.m_out 0.22577087581157684 distilbert.transformer.layer.3.attention.out_lin.m_in 0.25172096490859985 distilbert.transformer.layer.4.attention.q_lin.m_out 0.33011841773986816 distilbert.transformer.layer.4.attention.q_lin.m_in 0.22704020142555237 distilbert.transformer.layer.4.attention.k_lin.m_out 0.35280928015708923 distilbert.transformer.layer.4.attention.k_lin.m_in 0.2340744584798813 distilbert.transformer.layer.4.attention.v_lin.m_out 0.23984378576278687 distilbert.transformer.layer.4.attention.v_lin.m_in 0.23275384306907654 distilbert.transformer.layer.4.attention.out_lin.m_out 0.2580145597457886 distilbert.transformer.layer.4.attention.out_lin.m_in 0.22269290685653687 distilbert.transformer.layer.5.attention.q_lin.m_out 0.2887585163116455 distilbert.transformer.layer.5.attention.q_lin.m_in 0.2514832615852356 distilbert.transformer.layer.5.attention.k_lin.m_out 0.3360157012939453 distilbert.transformer.layer.5.attention.k_lin.m_in 0.24031364917755127 distilbert.transformer.layer.5.attention.v_lin.m_out 0.2495293915271759 
distilbert.transformer.layer.5.attention.v_lin.m_in 0.2279600352048874 distilbert.transformer.layer.5.attention.out_lin.m_out 0.22755685448646545 distilbert.transformer.layer.5.attention.out_lin.m_in 0.23445087671279907 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 11.7324 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 9.6954 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 11.5981 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 9.4668 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 9.5462 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 9.3071 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 8.1026 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 8.8013 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 10.8487 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 9.2985 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 10.1298 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 8.9646 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 8.4627 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.0157 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.6411 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 8.7485 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 10.2685 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.2437 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 10.6241 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 9.7152 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.1927 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 8.9594 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 7.9492 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 8.4714 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 10.8409 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.1677 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 11.5735 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 9.4100 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.1937 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 8.3381 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 8.1634 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.4534 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 11.6548 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 8.5402 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 11.9912 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 8.6368 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.4722 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 8.7789 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 9.1713 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 8.7746 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 10.5080 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.1827 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 11.2278 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.7580 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.1483 
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 8.9627 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 8.6252 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.4475
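A complementary way to read these statistics (an added sketch, not in the original notebook) is to compare the norm of the learned low-rank product against the frozen weight it sits beside. This ignores the normalization step and the m/scale vectors, so it is only a rough per-layer gauge of how large the adapter has grown.

# Sketch: relative magnitude of the low-rank product versus the frozen weight, per wrapped layer.
with torch.no_grad():
    for name, module in model_ddora_all_attn.named_modules():
        if isinstance(module, LinearWithDoubleDoRA):
            # A is (in_dim, rank), B is (rank, out_dim), so A @ B matches linear.weight.T in shape
            delta = module.lora.alpha * (module.lora.A @ module.lora.B)
            ratio = delta.norm() / module.linear.weight.norm()
            print(f"{name}: ||alpha * A @ B|| / ||W|| = {ratio.item():.3f}")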
In [8]:
def set_all_lora_dropout(model, new_dropout_rate):
    for module in model.modules():
        if isinstance(module, LoRALayer):
            module.dropout.p = new_dropout_rate

def print_dropout_rates(model):
    for name, module in model.named_modules():
        if isinstance(module, LoRALayer):
            print(f"{name}.dropout.p = {module.dropout.p}")

def split_lora_dora_params(model):
    lora_A_params = []
    lora_B_params = []
    m_params = []
    scale_params = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "lora.A" in name:
            lora_A_params.append(param)
        elif "lora.B" in name:
            lora_B_params.append(param)
        elif name.endswith("m_in") or name.endswith("m_out"):
            m_params.append(param)
        elif "scale" in name:
            scale_params.append(param)
    return {
        "lora_A": lora_A_params,
        "lora_B": lora_B_params,
        "m": m_params,
        "scale": scale_params,
    }

def create_custom_optimizer(model, base_lr=1e-4, lr_B_scale=10.0, lr_scale_params=0.2, weight_decay=0.01):
    param_groups = split_lora_dora_params(model)
    optimizer = torch.optim.AdamW([
        {"params": param_groups["lora_A"], "lr": base_lr},
        {"params": param_groups["lora_B"], "lr": base_lr * lr_B_scale},
        {"params": param_groups["m"], "lr": base_lr},
        {"params": param_groups["scale"], "lr": base_lr * lr_scale_params},
    ], weight_decay=weight_decay)
    return optimizer
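To see what the custom optimizer will actually receive, the grouping can be inspected first. The sketch below (not part of the original run) prints the size of each parameter group and the learning rate each group would get with the values chosen in the next cell.

# Sketch: inspect the parameter grouping and the resulting per-group learning rates.
groups = split_lora_dora_params(model_ddora_all_attn)
for group_name, params in groups.items():
    n = sum(p.numel() for p in params)
    print(f"{group_name}: {len(params)} tensors, {n:,} parameters")

opt = create_custom_optimizer(model_ddora_all_attn, base_lr=3e-3,
                              lr_B_scale=0.5, lr_scale_params=0.75, weight_decay=1e-5)
print([g["lr"] for g in opt.param_groups])  # [0.003, 0.0015, 0.003, 0.00225]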
In [9]:
# Set LoRA dropout to 0.1 to avoid overheating the lora.B channel.
# But avoid dropout = 0.0 when lora.B is already large enough: that leads to LoRA adaptors dying out,
# likely due to unregularised overfitting and gradient collapse on low-magnitude params.
set_all_lora_dropout(model_ddora_all_attn, 0.1)
print_dropout_rates(model_ddora_all_attn)
dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 3e-3 ###############
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=2,
    #max_steps=100,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,
    report_to="none",
    log_level="error"
)
trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)
trainer_ddora_all_attn.optimizer = create_custom_optimizer(
    trainer_ddora_all_attn.model,
    base_lr=3e-3,          ###########
    lr_B_scale=0.5,        #############
    lr_scale_params=0.75,  #########
    weight_decay=1e-5,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs) / len(vs) for name, vs in grouped.items()}
# Example output
for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[1564/1564 2:56:52, Epoch 2/2]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.183800 | 0.192741 | 0.927200 | 0.927106 |
100 | 0.150100 | 0.203222 | 0.924800 | 0.924822 |
150 | 0.181700 | 0.194170 | 0.928000 | 0.927914 |
200 | 0.120600 | 0.212781 | 0.926400 | 0.926253 |
250 | 0.145300 | 0.204888 | 0.929600 | 0.929542 |
300 | 0.143700 | 0.193794 | 0.927200 | 0.927205 |
350 | 0.151000 | 0.195130 | 0.923200 | 0.923223 |
400 | 0.118700 | 0.217758 | 0.925600 | 0.925504 |
450 | 0.128400 | 0.222594 | 0.916000 | 0.916093 |
500 | 0.144400 | 0.202495 | 0.928800 | 0.928760 |
550 | 0.133900 | 0.205837 | 0.930400 | 0.930223 |
600 | 0.138100 | 0.200773 | 0.929600 | 0.929542 |
650 | 0.098700 | 0.216366 | 0.923200 | 0.923253 |
700 | 0.110000 | 0.218037 | 0.929600 | 0.929516 |
750 | 0.110500 | 0.219765 | 0.930400 | 0.930384 |
800 | 0.119300 | 0.222164 | 0.922400 | 0.922449 |
850 | 0.092800 | 0.220728 | 0.923200 | 0.923176 |
900 | 0.085000 | 0.223266 | 0.928800 | 0.928708 |
950 | 0.092300 | 0.232182 | 0.928000 | 0.927940 |
1000 | 0.077500 | 0.238128 | 0.921600 | 0.921612 |
1050 | 0.092500 | 0.230383 | 0.928800 | 0.928708 |
1100 | 0.108000 | 0.225004 | 0.924800 | 0.924724 |
1150 | 0.074200 | 0.225988 | 0.925600 | 0.925570 |
1200 | 0.103900 | 0.228481 | 0.924800 | 0.924800 |
1250 | 0.098600 | 0.224238 | 0.928000 | 0.927953 |
1300 | 0.111800 | 0.224987 | 0.925600 | 0.925558 |
1350 | 0.118100 | 0.219593 | 0.926400 | 0.926283 |
1400 | 0.128100 | 0.213206 | 0.925600 | 0.925583 |
1450 | 0.147000 | 0.212475 | 0.926400 | 0.926389 |
1500 | 0.150800 | 0.209083 | 0.930400 | 0.930349 |
1550 | 0.158300 | 0.209505 | 0.928000 | 0.927965 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 559443 KiB | 12777 MiB | 262483 GiB | 262482 GiB | | from large pool | 541440 KiB | 12712 MiB | 261507 GiB | 261506 GiB | | from small pool | 18003 KiB | 67 MiB | 976 GiB | 975 GiB | |---------------------------------------------------------------------------| | Active memory | 559443 KiB | 12777 MiB | 262483 GiB | 262482 GiB | | from large pool | 541440 KiB | 12712 MiB | 261507 GiB | 261506 GiB | | from small pool | 18003 KiB | 67 MiB | 976 GiB | 975 GiB | |---------------------------------------------------------------------------| | Requested memory | 557230 KiB | 12774 MiB | 262238 GiB | 262238 GiB | | from large pool | 539228 KiB | 12708 MiB | 261267 GiB | 261267 GiB | | from small pool | 18002 KiB | 67 MiB | 971 GiB | 971 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 13058 MiB | 13058 MiB | 25404 MiB | 12346 MiB | | from large pool | 12988 MiB | 12988 MiB | 25272 MiB | 12284 MiB | | from small pool | 70 MiB | 70 MiB | 132 MiB | 62 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 186029 KiB | 403075 KiB | 43448 GiB | 43447 GiB | | from large pool | 179456 KiB | 396800 KiB | 42397 GiB | 42397 GiB | | from small pool | 6573 KiB | 43066 KiB | 1050 GiB | 1050 GiB | |---------------------------------------------------------------------------| | Allocations | 860 | 1343 | 24189 K | 24188 K | | from large pool | 80 | 334 | 6760 K | 6760 K | | from small pool | 780 | 1235 | 17429 K | 17428 K | |---------------------------------------------------------------------------| | Active allocs | 860 | 1343 | 24189 K | 24188 K | | from large pool | 80 | 334 | 6760 K | 6760 K | | from small pool | 780 | 1235 | 17429 K | 17428 K | |---------------------------------------------------------------------------| | GPU reserved segments | 263 | 266 | 503 | 240 | | from large pool | 228 | 231 | 437 | 209 | | from small pool | 35 | 35 | 66 | 31 | |---------------------------------------------------------------------------| | Non-releasable allocs | 39 | 91 | 10682 K | 10682 K | | from large pool | 19 | 25 | 1321 K | 1321 K | | from small pool | 20 | 73 | 9361 K | 9361 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2766, |B|=0.1648, |∇A|=1.822e-05, |∇B|=1.927e-05, |LoRA(x)|=2.131e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.2717, |B|=0.1628, |∇A|=9.544e-06, |∇B|=1.661e-05, |LoRA(x)|=2.47e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2634, |B|=0.1392, |∇A|=1.583e-05, |∇B|=2.336e-05, |LoRA(x)|=2.493e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2489, |B|=0.133, |∇A|=2.148e-05, |∇B|=4.675e-05, |LoRA(x)|=1.355e+04, B≠0=12288 
distilbert.transformer.layer.0.ffn.lin1: |A|=0.2744, |B|=0.1644, |∇A|=3.302e-05, |∇B|=2.011e-05, |LoRA(x)|=5.029e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2544, |B|=0.1311, |∇A|=9.187e-06, |∇B|=4.521e-05, |LoRA(x)|=2.962e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2685, |B|=0.1504, |∇A|=1.795e-05, |∇B|=2.067e-05, |LoRA(x)|=1.886e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.2579, |B|=0.1532, |∇A|=1.2e-05, |∇B|=2.014e-05, |LoRA(x)|=1.907e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2526, |B|=0.1249, |∇A|=1.726e-05, |∇B|=3.479e-05, |LoRA(x)|=2.549e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2486, |B|=0.13, |∇A|=2.062e-05, |∇B|=5.314e-05, |LoRA(x)|=1.714e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.273, |B|=0.1612, |∇A|=3.055e-05, |∇B|=2.66e-05, |LoRA(x)|=5.169e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.263, |B|=0.1282, |∇A|=2.077e-05, |∇B|=9.076e-05, |LoRA(x)|=1.153e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2688, |B|=0.157, |∇A|=1.398e-05, |∇B|=2.323e-05, |LoRA(x)|=2.649e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2723, |B|=0.1652, |∇A|=1.601e-05, |∇B|=2.355e-05, |LoRA(x)|=2.624e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2425, |B|=0.11, |∇A|=7.563e-06, |∇B|=2.657e-05, |LoRA(x)|=3.606e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2457, |B|=0.1274, |∇A|=2.761e-05, |∇B|=5.052e-05, |LoRA(x)|=1.18e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.2775, |B|=0.1647, |∇A|=2.621e-05, |∇B|=2.474e-05, |LoRA(x)|=6.946e+04, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2588, |B|=0.1265, |∇A|=2.077e-05, |∇B|=7.816e-05, |LoRA(x)|=1.667e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.2616, |B|=0.1618, |∇A|=1.442e-05, |∇B|=2.771e-05, |LoRA(x)|=2.31e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.2712, |B|=0.1625, |∇A|=2.371e-05, |∇B|=2.745e-05, |LoRA(x)|=2.818e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.246, |B|=0.1116, |∇A|=1.432e-05, |∇B|=2.389e-05, |LoRA(x)|=2.312e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2606, |B|=0.1285, |∇A|=4.064e-05, |∇B|=4.222e-05, |LoRA(x)|=1.318e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2516, |B|=0.1448, |∇A|=2.834e-05, |∇B|=2.456e-05, |LoRA(x)|=4.688e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2519, |B|=0.1191, |∇A|=1.329e-05, |∇B|=5.109e-05, |LoRA(x)|=2.704e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.2542, |B|=0.1669, |∇A|=1.183e-05, |∇B|=2.58e-05, |LoRA(x)|=2.796e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2533, |B|=0.1575, |∇A|=3.202e-05, |∇B|=2.64e-05, |LoRA(x)|=2.156e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2443, |B|=0.1099, |∇A|=1.412e-05, |∇B|=1.884e-05, |LoRA(x)|=2.271e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2588, |B|=0.1233, |∇A|=3.874e-05, |∇B|=3.313e-05, |LoRA(x)|=1.536e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2514, |B|=0.1342, |∇A|=1.91e-05, |∇B|=1.335e-05, |LoRA(x)|=4.969e+04, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2338, |B|=0.09712, |∇A|=2.099e-06, |∇B|=1.596e-05, |LoRA(x)|=6.15e+04, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.255, |B|=0.1575, |∇A|=1.022e-05, |∇B|=2.607e-05, 
|LoRA(x)|=2.573e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.2531, |B|=0.1351, |∇A|=2.564e-05, |∇B|=1.366e-05, |LoRA(x)|=2.721e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.243, |B|=0.09413, |∇A|=5.856e-06, |∇B|=6.719e-06, |LoRA(x)|=3.893e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.2476, |B|=0.09841, |∇A|=1.116e-05, |∇B|=1.517e-05, |LoRA(x)|=1.939e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.2302, |B|=0.1052, |∇A|=3.538e-06, |∇B|=4.697e-06, |LoRA(x)|=7.538e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2344, |B|=0.1046, |∇A|=4.669e-07, |∇B|=8.693e-06, |LoRA(x)|=1.606e+05, B≠0=12288
In [10]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")

print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")

print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.27112382650375366 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.1630639284849167 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.26672476530075073 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.1610177606344223 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.25969889760017395 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.13719330728054047 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.24477171897888184 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.1304492950439453 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.26895368099212646 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.16271314024925232 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2498529851436615 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.12885072827339172 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.26298922300338745 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.14843596518039703 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2530231475830078 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.15127363801002502 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.24832753837108612 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12246879935264587 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.244970440864563 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.12774094939231873 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2681034505367279 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.15937507152557373 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.2582159638404846 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.12570586800575256 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.26331281661987305 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15518896281719208 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.26663994789123535 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1629818081855774 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23866932094097137 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.10743415355682373 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24211303889751434 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.12474947422742844 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.2726806402206421 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.16278992593288422 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2547855079174042 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12400786578655243 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2568298876285553 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.15992189943790436 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.26599007844924927 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.16072803735733032 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.24225488305091858 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.10922425985336304 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.25584137439727783 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.12585961818695068 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24706082046031952 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.1427352875471115 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.2488764226436615 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11672824621200562 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.24979417026042938 
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.16525492072105408 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.24865584075450897 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.1557541936635971 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.23983024060726166 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.1067819595336914 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.2550323009490967 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.12050957977771759 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.24768595397472382 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13194093108177185 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2323710173368454 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.09379933774471283 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.2502736449241638 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.15573155879974365 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.24944671988487244 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.13200370967388153 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.2393648475408554 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.09190337359905243 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24240228533744812 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.09614896774291992 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.22430342435836792 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.10205920785665512 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.23144260048866272 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.10230254381895065 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 38.1368 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 22.9601 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 37.2745 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 22.5603 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 36.7953 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 19.4329 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 34.6474 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 18.4435 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 37.9731 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 45.8902 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 71.2854 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 18.4120 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 37.2484 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 21.1275 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 35.7473 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 21.3421 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 34.9519 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 17.5259 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 34.4125 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 17.9544 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 37.8841 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 45.1256 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 73.2333 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 17.8865 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 37.0931 
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 21.9345 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 37.6618 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 22.8409 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 33.6147 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 15.0488 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 34.1995 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 17.7327 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 38.6539 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 46.3207 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 72.4403 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 17.6273 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 36.0927 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 22.3834 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 37.5350 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 22.5666 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 34.3272 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 15.6164 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 36.1461 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 17.8334 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 34.7769 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 40.2458 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 70.9050 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 16.6516 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 35.2357 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 23.1006 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 35.0209 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 22.0409 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 33.9457 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 15.0591 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 36.2503 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 17.1982 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 34.9368 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 37.5159 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 65.3953 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 13.4135 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 35.1958 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 21.9981 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 35.2734 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 18.9867 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.0502 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 12.9391 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.1047 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 13.6182 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 31.4527 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 29.4683 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 64.5749 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 14.9198 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.0465407371520996 
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9752111434936523 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0412023067474365 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9757541418075562 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9554919004440308 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9565114974975586 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.8965394496917725 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9504833221435547 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.0365495681762695 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9632136821746826 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.964491844177246 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.978514552116394 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9338481426239014 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9615120887756348 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.916195273399353 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9427350759506226 distilbert.transformer.layer.2.attention.q_lin.scale_out 1.9956350326538086 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9740338325500488 distilbert.transformer.layer.2.attention.k_lin.scale_out 1.9878133535385132 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.977526068687439 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.913891077041626 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9519472122192383 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.8938149213790894 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9330028295516968 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0180916786193848 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9704923629760742 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0638022422790527 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.944108486175537 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9252495765686035 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9149510860443115 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9138020277023315 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9416310787200928 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.0417110919952393 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.924926996231079 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.0672411918640137 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9433913230895996 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.9220985174179077 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.934260368347168 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9482431411743164 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.8934226036071777 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.9617071151733398 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9448035955429077 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.0657835006713867 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.935713768005371 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.921055555343628 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.927734136581421 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.904882788658142 distilbert.transformer.layer.5.attention.out_lin.scale_in 
1.9699764251708984 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 57.3685 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.1691 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.1965 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 55.1928 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.7002 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.6864 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.0231 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 54.4218 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.9646 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 54.8670 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 54.9941 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.1790 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.9677 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 54.7122 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.4895 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.1899 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 55.7793 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 55.0983 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 55.6772 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.2406 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.3336 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 54.4208 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 52.8776 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.9624 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.4743 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.9773 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 57.7329 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.3512 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 53.7000 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.4600 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.4338 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 54.2416 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.2060 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7611 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.8632 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.1837 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 53.6946 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 53.9613 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.4798 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 53.0302 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.0736 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 54.3139 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.6768 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.9956 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 
53.8977 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.8537 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.4194 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.8976 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.3474145829677582 distilbert.transformer.layer.0.attention.q_lin.m_in 0.2722855806350708 distilbert.transformer.layer.0.attention.k_lin.m_out 0.3388528823852539 distilbert.transformer.layer.0.attention.k_lin.m_in 0.2681219279766083 distilbert.transformer.layer.0.attention.v_lin.m_out 0.2731800973415375 distilbert.transformer.layer.0.attention.v_lin.m_in 0.25259310007095337 distilbert.transformer.layer.0.attention.out_lin.m_out 0.226003497838974 distilbert.transformer.layer.0.attention.out_lin.m_in 0.2322026491165161 distilbert.transformer.layer.1.attention.q_lin.m_out 0.31867843866348267 distilbert.transformer.layer.1.attention.q_lin.m_in 0.2533246576786041 distilbert.transformer.layer.1.attention.k_lin.m_out 0.28546828031539917 distilbert.transformer.layer.1.attention.k_lin.m_in 0.25377875566482544 distilbert.transformer.layer.1.attention.v_lin.m_out 0.23614013195037842 distilbert.transformer.layer.1.attention.v_lin.m_in 0.24751576781272888 distilbert.transformer.layer.1.attention.out_lin.m_out 0.2409364879131317 distilbert.transformer.layer.1.attention.out_lin.m_in 0.23296980559825897 distilbert.transformer.layer.2.attention.q_lin.m_out 0.293522447347641 distilbert.transformer.layer.2.attention.q_lin.m_in 0.25772082805633545 distilbert.transformer.layer.2.attention.k_lin.m_out 0.3025796413421631 distilbert.transformer.layer.2.attention.k_lin.m_in 0.27497267723083496 distilbert.transformer.layer.2.attention.v_lin.m_out 0.22029690444469452 distilbert.transformer.layer.2.attention.v_lin.m_in 0.23904626071453094 distilbert.transformer.layer.2.attention.out_lin.m_out 0.22309017181396484 distilbert.transformer.layer.2.attention.out_lin.m_in 0.22285161912441254 distilbert.transformer.layer.3.attention.q_lin.m_out 0.31796714663505554 distilbert.transformer.layer.3.attention.q_lin.m_in 0.25912511348724365 distilbert.transformer.layer.3.attention.k_lin.m_out 0.34453338384628296 distilbert.transformer.layer.3.attention.k_lin.m_in 0.2547050714492798 distilbert.transformer.layer.3.attention.v_lin.m_out 0.2236625999212265 distilbert.transformer.layer.3.attention.v_lin.m_in 0.2172790765762329 distilbert.transformer.layer.3.attention.out_lin.m_out 0.22678808867931366 distilbert.transformer.layer.3.attention.out_lin.m_in 0.25232821702957153 distilbert.transformer.layer.4.attention.q_lin.m_out 0.3307022154331207 distilbert.transformer.layer.4.attention.q_lin.m_in 0.22706055641174316 distilbert.transformer.layer.4.attention.k_lin.m_out 0.3556174635887146 distilbert.transformer.layer.4.attention.k_lin.m_in 0.23558568954467773 distilbert.transformer.layer.4.attention.v_lin.m_out 0.2392875999212265 distilbert.transformer.layer.4.attention.v_lin.m_in 0.23174671828746796 distilbert.transformer.layer.4.attention.out_lin.m_out 0.2541300058364868 distilbert.transformer.layer.4.attention.out_lin.m_in 0.22204169631004333 distilbert.transformer.layer.5.attention.q_lin.m_out 0.28805413842201233 distilbert.transformer.layer.5.attention.q_lin.m_in 0.25228437781333923 distilbert.transformer.layer.5.attention.k_lin.m_out 0.335427463054657 distilbert.transformer.layer.5.attention.k_lin.m_in 0.23985588550567627 distilbert.transformer.layer.5.attention.v_lin.m_out 0.24964062869548798 
distilbert.transformer.layer.5.attention.v_lin.m_in 0.22856232523918152 distilbert.transformer.layer.5.attention.out_lin.m_out 0.2270231693983078 distilbert.transformer.layer.5.attention.out_lin.m_in 0.23365651071071625 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 11.9316 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 9.7012 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 11.6341 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 9.4691 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 9.6691 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 9.3389 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 8.2296 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 8.8311 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 10.9564 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 9.3235 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 10.1870 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 8.9705 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 8.5747 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.0465 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.7393 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 8.7452 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 10.3104 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.2655 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 10.6525 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 9.7387 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.2576 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 8.9262 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 7.9964 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 8.5101 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 10.9827 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.2065 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 11.6543 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 9.4198 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.2145 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 8.3746 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 8.1837 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.4723 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 11.6686 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 8.5626 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 12.0515 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 8.6679 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.4655 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 8.7757 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 9.0607 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 8.7684 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 10.4844 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.2177 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 11.2213 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.7647 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.1354 
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 8.9712 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 8.6173 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.4556
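The dump above is long. As an optional aside (not part of the original notebook), the same statistics can be condensed into one line per transformer layer; the sketch below only assumes the parameter-name conventions visible in the printout (lora.A, lora.B, scale_in/scale_out, m_in/m_out):

import re
from collections import defaultdict

layer_stats = defaultdict(lambda: defaultdict(list))
for name, param in model_ddora_all_attn.named_parameters():
    match = re.search(r"layer\.(\d+)\.", name)
    if match is None:
        continue
    layer = int(match.group(1))
    # Bucket each DDoRA parameter by its kind and record its mean absolute value
    for kind in ("lora.A", "lora.B", "scale_in", "scale_out", "m_in", "m_out"):
        if kind in name:
            layer_stats[layer][kind].append(param.abs().mean().item())

for layer in sorted(layer_stats):
    summary = ", ".join(f"{kind}={sum(v) / len(v):.3f}" for kind, v in sorted(layer_stats[layer].items()))
    print(f"layer {layer}: {summary}")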
Try to reset training
In order to allow the LLM to start afresh in the last transformer block, set the LoRA parameters in layer 5 to random values. Note that lora.B is reinitialised with a ten times smaller standard deviation than lora.A.
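As a quick sanity check on the chosen scales (an aside, not part of the original notebook): for a zero-mean normal with standard deviation σ, the expected absolute value is σ·√(2/π) ≈ 0.80·σ, so std=0.1 for lora.A should give |A| ≈ 0.080 and std=0.01 for lora.B should give |B| ≈ 0.008, which is what the layer-5 entries in the sanity check below show.

import math

for sigma in (0.1, 0.01):
    # E|X| for X ~ N(0, sigma^2) is sigma * sqrt(2 / pi); prints ~0.0798 and ~0.0080
    print(f"std={sigma}: expected mean |x| = {sigma * math.sqrt(2 / math.pi):.4f}")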
In [11]:
def reinit_ddora_in_layer(model, layer_idx_to_reinit=5, std=0.1):
    for name, module in model.named_modules():
        if isinstance(module, LinearWithDoubleDoRA) and f"layer.{layer_idx_to_reinit}." in name:
            # Reinitialize LoRA A and B; B gets a 10x smaller std than A (see note above)
            torch.nn.init.normal_(module.lora.A, mean=0.0, std=std)
            torch.nn.init.normal_(module.lora.B, mean=0.0, std=0.1 * std)
            print(f"Reinitialized weights in {name}")

reinit_ddora_in_layer(model_ddora_all_attn, layer_idx_to_reinit=5, std=0.1)

# Sanity check
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
Reinitialized weights in distilbert.transformer.layer.5.attention.q_lin Reinitialized weights in distilbert.transformer.layer.5.attention.k_lin Reinitialized weights in distilbert.transformer.layer.5.attention.v_lin Reinitialized weights in distilbert.transformer.layer.5.attention.out_lin Reinitialized weights in distilbert.transformer.layer.5.ffn.lin1 Reinitialized weights in distilbert.transformer.layer.5.ffn.lin2 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.27112382650375366 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.1630639284849167 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.26672476530075073 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.1610177606344223 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.25969889760017395 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.13719330728054047 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.24477171897888184 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.1304492950439453 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.26895368099212646 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.16271314024925232 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2498529851436615 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.12885072827339172 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.26298922300338745 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.14843596518039703 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2530231475830078 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.15127363801002502 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.24832753837108612 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12246879935264587 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.244970440864563 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.12774094939231873 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2681034505367279 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.15937507152557373 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.2582159638404846 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.12570586800575256 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.26331281661987305 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15518896281719208 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.26663994789123535 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1629818081855774 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23866932094097137 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.10743415355682373 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24211303889751434 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.12474947422742844 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.2726806402206421 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.16278992593288422 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2547855079174042 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12400786578655243 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2568298876285553 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.15992189943790436 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.26599007844924927 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.16072803735733032 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.24225488305091858 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.10922425985336304 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.25584137439727783 
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.12585961818695068 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24706082046031952 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.1427352875471115 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.2488764226436615 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11672824621200562 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.24979417026042938 distilbert.transformer.layer.4.attention.q_lin.lora.B 0.16525492072105408 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.24865584075450897 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.1557541936635971 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.23983024060726166 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.1067819595336914 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.2550323009490967 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.12050957977771759 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.24768595397472382 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13194093108177185 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2323710173368454 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.09379933774471283 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.07994435727596283 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.00801579188555479 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.07955452054738998 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.00791049562394619 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.080181784927845 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.007992936298251152 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.08023684471845627 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.008007392287254333 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.07936650514602661 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.008047424256801605 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.07979173213243484 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.008014846593141556
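Only lora.A and lora.B are reset above; the layer-5 magnitude (m_in, m_out) and scale (scale_in, scale_out) vectors keep their trained values. If a fuller reset were wanted, something along the following lines could be used. This is only a sketch: it assumes LinearWithDoubleDoRA exposes those vectors as tensor attributes with the names seen in the printouts, and the reset constant would have to match whatever the class uses at initialisation.

def reinit_ddora_vectors_in_layer(model, layer_idx=5, value=1.0):
    # Hypothetical helper: reset the DDoRA magnitude/scale vectors of one layer to a constant
    for name, module in model.named_modules():
        if isinstance(module, LinearWithDoubleDoRA) and f"layer.{layer_idx}." in name:
            with torch.no_grad():
                for attr in ("m_in", "m_out", "scale_in", "scale_out"):
                    if hasattr(module, attr):
                        getattr(module, attr).fill_(value)
            print(f"Reset magnitude/scale vectors in {name}")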
In [12]:
from transformers import TrainingArguments

learning_rate = 0.01
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"

training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=2,
    #max_steps=100,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,
    report_to="none",
    log_level="error"
)

trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)

# Replace the Trainer's default optimizer with the custom one defined earlier in the notebook
trainer_ddora_all_attn.optimizer = create_custom_optimizer(
    trainer_ddora_all_attn.model,
    base_lr=0.01,
    lr_B_scale=1.0,
    lr_scale_params=1.0,
    weight_decay=1e-5,
)

hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)

# Train!
trainer_ddora_all_attn.train()
print(torch.cuda.memory_summary())

#for hook in hooks2:
#    hook.remove()
#for name, grads in gradient_history2.items():
#    print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")

for hook in hooks1:
    hook.remove()

# Aggregate/log the monitored LoRA statistics after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs) / len(vs) for name, vs in grouped.items()}

# Example output
for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")

#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[1564/1564 2:55:52, Epoch 2/2]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.160700 | 0.218726 | 0.912800 | 0.912780 |
100 | 0.171300 | 0.222383 | 0.906400 | 0.906569 |
150 | 0.213900 | 0.236124 | 0.904800 | 0.904978 |
200 | 0.150800 | 0.289784 | 0.908000 | 0.907127 |
250 | 0.180000 | 0.214728 | 0.922400 | 0.922406 |
300 | 0.174000 | 0.204650 | 0.925600 | 0.925341 |
350 | 0.187800 | 0.207140 | 0.917600 | 0.917641 |
400 | 0.134700 | 0.236044 | 0.910400 | 0.910570 |
450 | 0.162000 | 0.221878 | 0.920800 | 0.920794 |
500 | 0.176500 | 0.212989 | 0.924800 | 0.924710 |
550 | 0.161800 | 0.222250 | 0.924800 | 0.924548 |
600 | 0.180800 | 0.195855 | 0.928800 | 0.928722 |
650 | 0.143300 | 0.219471 | 0.918400 | 0.918446 |
700 | 0.145500 | 0.221471 | 0.924800 | 0.924822 |
750 | 0.148400 | 0.205013 | 0.924800 | 0.924764 |
800 | 0.148800 | 0.218529 | 0.927200 | 0.927047 |
850 | 0.099900 | 0.242431 | 0.912000 | 0.912135 |
900 | 0.098000 | 0.227289 | 0.923200 | 0.923078 |
950 | 0.119600 | 0.227527 | 0.921600 | 0.921664 |
1000 | 0.095400 | 0.241565 | 0.911200 | 0.911298 |
1050 | 0.119300 | 0.210961 | 0.926400 | 0.926326 |
1100 | 0.120700 | 0.220936 | 0.922400 | 0.922486 |
1150 | 0.087900 | 0.215886 | 0.931200 | 0.931104 |
1200 | 0.115600 | 0.226941 | 0.925600 | 0.925665 |
1250 | 0.111900 | 0.217936 | 0.927200 | 0.927205 |
1300 | 0.120600 | 0.213633 | 0.928800 | 0.928694 |
1350 | 0.127700 | 0.207188 | 0.928800 | 0.928805 |
1400 | 0.125600 | 0.207514 | 0.927200 | 0.927236 |
1450 | 0.129500 | 0.206957 | 0.928000 | 0.927953 |
1500 | 0.133400 | 0.206356 | 0.932000 | 0.931973 |
1550 | 0.143100 | 0.206889 | 0.925600 | 0.925637 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 559443 KiB | 12777 MiB | 393723 GiB | 393722 GiB | | from large pool | 541440 KiB | 12712 MiB | 392259 GiB | 392258 GiB | | from small pool | 18003 KiB | 67 MiB | 1464 GiB | 1463 GiB | |---------------------------------------------------------------------------| | Active memory | 559443 KiB | 12777 MiB | 393723 GiB | 393722 GiB | | from large pool | 541440 KiB | 12712 MiB | 392259 GiB | 392258 GiB | | from small pool | 18003 KiB | 67 MiB | 1464 GiB | 1463 GiB | |---------------------------------------------------------------------------| | Requested memory | 557230 KiB | 12774 MiB | 393356 GiB | 393355 GiB | | from large pool | 539228 KiB | 12708 MiB | 391899 GiB | 391899 GiB | | from small pool | 18002 KiB | 67 MiB | 1456 GiB | 1456 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 13058 MiB | 13058 MiB | 37750 MiB | 24692 MiB | | from large pool | 12988 MiB | 12988 MiB | 37556 MiB | 24568 MiB | | from small pool | 70 MiB | 70 MiB | 194 MiB | 124 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 186029 KiB | 403075 KiB | 65659 GiB | 65659 GiB | | from large pool | 179456 KiB | 396800 KiB | 64082 GiB | 64082 GiB | | from small pool | 6573 KiB | 43066 KiB | 1576 GiB | 1576 GiB | |---------------------------------------------------------------------------| | Allocations | 860 | 1343 | 36281 K | 36280 K | | from large pool | 80 | 334 | 10139 K | 10139 K | | from small pool | 780 | 1235 | 26142 K | 26141 K | |---------------------------------------------------------------------------| | Active allocs | 860 | 1343 | 36281 K | 36280 K | | from large pool | 80 | 334 | 10139 K | 10139 K | | from small pool | 780 | 1235 | 26142 K | 26141 K | |---------------------------------------------------------------------------| | GPU reserved segments | 263 | 266 | 740 | 477 | | from large pool | 228 | 231 | 643 | 415 | | from small pool | 35 | 35 | 97 | 62 | |---------------------------------------------------------------------------| | Non-releasable allocs | 39 | 91 | 16089 K | 16089 K | | from large pool | 19 | 25 | 1991 K | 1991 K | | from small pool | 20 | 73 | 14097 K | 14097 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.3126, |B|=0.2302, |∇A|=2.09e-05, |∇B|=1.922e-05, |LoRA(x)|=3.967e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.3092, |B|=0.2284, |∇A|=1.053e-05, |∇B|=1.543e-05, |LoRA(x)|=4.963e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2922, |B|=0.2073, |∇A|=1.761e-05, |∇B|=1.981e-05, |LoRA(x)|=4.697e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2717, |B|=0.2002, |∇A|=1.858e-05, |∇B|=3.249e-05, |LoRA(x)|=2.781e+04, 
B≠0=12288 distilbert.transformer.layer.0.ffn.lin1: |A|=0.3042, |B|=0.2282, |∇A|=3.403e-05, |∇B|=1.861e-05, |LoRA(x)|=9.171e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2791, |B|=0.1975, |∇A|=9.822e-06, |∇B|=3.787e-05, |LoRA(x)|=5.548e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.303, |B|=0.2173, |∇A|=1.77e-05, |∇B|=1.789e-05, |LoRA(x)|=3.93e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.2947, |B|=0.2204, |∇A|=1.252e-05, |∇B|=1.695e-05, |LoRA(x)|=4.102e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2841, |B|=0.1917, |∇A|=1.623e-05, |∇B|=2.607e-05, |LoRA(x)|=5.479e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2672, |B|=0.1968, |∇A|=1.62e-05, |∇B|=3.569e-05, |LoRA(x)|=3.751e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.3058, |B|=0.2265, |∇A|=3.013e-05, |∇B|=2.372e-05, |LoRA(x)|=1.013e+05, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.2887, |B|=0.1928, |∇A|=1.907e-05, |∇B|=6.336e-05, |LoRA(x)|=2.348e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.3032, |B|=0.223, |∇A|=1.336e-05, |∇B|=1.902e-05, |LoRA(x)|=5.349e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.3088, |B|=0.23, |∇A|=1.665e-05, |∇B|=2.032e-05, |LoRA(x)|=5.098e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2658, |B|=0.1809, |∇A|=6.622e-06, |∇B|=1.578e-05, |LoRA(x)|=8.134e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.269, |B|=0.1936, |∇A|=2.082e-05, |∇B|=3.073e-05, |LoRA(x)|=2.732e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.3077, |B|=0.2274, |∇A|=2.392e-05, |∇B|=1.99e-05, |LoRA(x)|=1.387e+05, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2787, |B|=0.1889, |∇A|=1.716e-05, |∇B|=5.25e-05, |LoRA(x)|=3.613e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.295, |B|=0.2245, |∇A|=1.476e-05, |∇B|=2.476e-05, |LoRA(x)|=4.581e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.3041, |B|=0.2277, |∇A|=2.366e-05, |∇B|=2.403e-05, |LoRA(x)|=5.223e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2668, |B|=0.175, |∇A|=1.119e-05, |∇B|=1.345e-05, |LoRA(x)|=4.834e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2866, |B|=0.1912, |∇A|=3.587e-05, |∇B|=2.687e-05, |LoRA(x)|=2.684e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2792, |B|=0.2092, |∇A|=2.456e-05, |∇B|=1.818e-05, |LoRA(x)|=1.005e+05, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.271, |B|=0.1839, |∇A|=1.121e-05, |∇B|=3.011e-05, |LoRA(x)|=5.787e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.2787, |B|=0.2283, |∇A|=1.036e-05, |∇B|=2.179e-05, |LoRA(x)|=5.502e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2814, |B|=0.2195, |∇A|=3.045e-05, |∇B|=2.186e-05, |LoRA(x)|=4.356e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2593, |B|=0.177, |∇A|=8.482e-06, |∇B|=9.028e-06, |LoRA(x)|=5.856e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2776, |B|=0.1859, |∇A|=3.611e-05, |∇B|=2.187e-05, |LoRA(x)|=2.958e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2724, |B|=0.1996, |∇A|=1.66e-05, |∇B|=9.22e-06, |LoRA(x)|=1.139e+05, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2564, |B|=0.1726, |∇A|=2.607e-06, |∇B|=1.084e-05, |LoRA(x)|=1.182e+05, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.117, |B|=0.08833, |∇A|=2.397e-06, |∇B|=8.994e-06, 
|LoRA(x)|=5.911e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.09919, |B|=0.07128, |∇A|=1.035e-05, |∇B|=7.658e-06, |LoRA(x)|=2.261e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.1026, |B|=0.06929, |∇A|=7.872e-06, |∇B|=6.551e-06, |LoRA(x)|=1.372e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.1047, |B|=0.07159, |∇A|=9.197e-06, |∇B|=9.113e-06, |LoRA(x)|=1.067e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.1051, |B|=0.08229, |∇A|=2.444e-06, |∇B|=1.337e-06, |LoRA(x)|=9.152e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.1091, |B|=0.07372, |∇A|=1.452e-06, |∇B|=1.014e-05, |LoRA(x)|=6.988e+04, B≠0=12288
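create_custom_optimizer and monitor_lora_parameters are defined earlier in the notebook and are not reproduced here. As a rough illustration of the parameter-group idea behind such an optimizer (a sketch under the assumption that lr_B_scale and lr_scale_params act as multipliers on base_lr for the lora.B matrices and the scale/magnitude vectors), it could look like this:

from torch.optim import AdamW

def create_custom_optimizer_sketch(model, base_lr=0.01, lr_B_scale=1.0,
                                   lr_scale_params=1.0, weight_decay=1e-5):
    # Split the trainable parameters into groups so each group can get its own learning rate
    a_params, b_params, scale_params = [], [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "lora.B" in name:
            b_params.append(param)
        elif any(k in name for k in ("scale_in", "scale_out", "m_in", "m_out")):
            scale_params.append(param)
        else:
            a_params.append(param)
    return AdamW(
        [
            {"params": a_params, "lr": base_lr},
            {"params": b_params, "lr": base_lr * lr_B_scale},
            {"params": scale_params, "lr": base_lr * lr_scale_params},
        ],
        weight_decay=weight_decay,
    )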
In [13]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")

print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")

print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())

print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.3130873441696167 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.2317005693912506 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.30914968252182007 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.23085884749889374 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.2924914062023163 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.20787815749645233 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2723824083805084 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.202121302485466 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3045843243598938 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.2294681966304779 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.27814415097236633 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.1975916028022766 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.30333036184310913 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.2194133996963501 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2958250045776367 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.2223823368549347 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.28339284658432007 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.1932627409696579 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2665144205093384 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.19822382926940918 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.3056941032409668 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.22772006690502167 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.2875228226184845 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.19381487369537354 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.30375099182128906 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.22436009347438812 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.31074726581573486 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.23241916298866272 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.26412829756736755 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.18227002024650574 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26882991194725037 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.19460999965667725 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.307273268699646 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.22827479243278503 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.27742069959640503 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.18766552209854126 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.296262264251709 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.2254493683576584 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.3027537763118744 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.23034703731536865 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2667587399482727 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.17558489739894867 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.286637544631958 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.19357585906982422 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.2796766459941864 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.21113112568855286 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.2697647511959076 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.1837177276611328 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.27963075041770935 
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.23049567639827728 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.28026485443115234 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.2212812602519989 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.25902503728866577 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.17878524959087372 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.27715545892715454 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.1859014481306076 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.27251890301704407 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.20201516151428223 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2590160369873047 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.17491042613983154 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.11481782048940659 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.08681395649909973 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.09903478622436523 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.07141385972499847 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.10359257459640503 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.07047466933727264 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.10529216378927231 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.07134665548801422 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.10542669147253036 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.08239973336458206 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.10737373679876328 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.07374820113182068 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 44.0741 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 32.3339 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 43.3585 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 32.1052 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 41.6555 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 29.3481 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.9750 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 28.2538 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 42.9607 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 64.0990 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 79.9170 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 27.7804 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 43.0380 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 30.6561 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 41.9185 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 31.1508 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 40.0310 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 27.2371 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.0273 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 27.7243 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 43.3030 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 63.6348 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 81.9168 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 27.1880 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 42.6822 
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 31.3766 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 43.8474 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 32.5143 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 37.6679 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 25.1553 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 38.4100 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 27.2024 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 43.4614 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 64.1520 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 79.6359 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 26.5319 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 41.6899 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 31.5036 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 42.9434 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 32.1261 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 37.9713 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 24.7154 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 40.7833 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 27.3095 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 39.6625 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 59.4333 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 77.7465 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 26.2776 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 39.6635 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 32.3892 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 39.8031 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 31.1862 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 37.1783 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 24.9920 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 39.6877 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 26.4982 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 38.8166 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 57.1436 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 73.5394 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 24.6756 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 16.8938 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 12.4322 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 14.0184 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 10.2482 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 15.1539 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 9.7891 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 15.2001 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 10.2070 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 15.2559 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 23.3752 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 30.7747 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 10.5845 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.143415927886963 
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.945195198059082 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.1160669326782227 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9384760856628418 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9423205852508545 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.8645776510238647 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.8191121816635132 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.871808409690857 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.068324565887451 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9219297170639038 distilbert.transformer.layer.1.attention.k_lin.scale_out 2.0182478427886963 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9412158727645874 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.8924511671066284 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9078330993652344 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.8356657028198242 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.8385319709777832 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0020720958709717 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9397926330566406 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.004977226257324 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9489619731903076 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.8331149816513062 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.8606138229370117 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.7949244976043701 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.8371928930282593 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0343732833862305 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9375890493392944 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.086409091949463 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9043607711791992 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.8417439460754395 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.8316080570220947 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.8121240139007568 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.8697593212127686 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.020796775817871 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.8351774215698242 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.0617518424987793 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.8887094259262085 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8100652694702148 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.8032965660095215 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.8582032918930054 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.7846527099609375 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.8675384521484375 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.8863449096679688 distilbert.transformer.layer.5.attention.k_lin.scale_out 1.9487051963806152 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9130948781967163 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.7303485870361328 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.8914060592651367 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.6965138912200928 distilbert.transformer.layer.5.attention.out_lin.scale_in 
1.946123480796814 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 60.5458 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 54.7531 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 59.8328 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.6071 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.9024 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 52.8202 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 51.4561 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 52.7825 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 58.3508 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 54.2026 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 57.0470 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 54.5709 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.3235 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.6935 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 51.8552 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 51.8757 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.4675 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.5555 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 56.7332 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 54.8873 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 51.5131 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 52.4116 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 50.7663 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 51.9165 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 57.3369 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4475 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.7842 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 53.6936 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 51.8932 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 51.6082 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 51.2666 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 52.7824 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.2494 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 51.8942 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 58.1691 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 53.0914 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.3630 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 50.9851 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 52.6959 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 50.7288 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 53.0831 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 52.8647 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 54.5827 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.4631 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 
49.1248 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.0848 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 48.4228 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.4398 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.4758349359035492 distilbert.transformer.layer.0.attention.q_lin.m_in 0.3171270489692688 distilbert.transformer.layer.0.attention.k_lin.m_out 0.45653432607650757 distilbert.transformer.layer.0.attention.k_lin.m_in 0.30572620034217834 distilbert.transformer.layer.0.attention.v_lin.m_out 0.33737608790397644 distilbert.transformer.layer.0.attention.v_lin.m_in 0.27412110567092896 distilbert.transformer.layer.0.attention.out_lin.m_out 0.25334274768829346 distilbert.transformer.layer.0.attention.out_lin.m_in 0.2535669207572937 distilbert.transformer.layer.1.attention.q_lin.m_out 0.4048326015472412 distilbert.transformer.layer.1.attention.q_lin.m_in 0.2937561571598053 distilbert.transformer.layer.1.attention.k_lin.m_out 0.39514288306236267 distilbert.transformer.layer.1.attention.k_lin.m_in 0.2929022014141083 distilbert.transformer.layer.1.attention.v_lin.m_out 0.2782657742500305 distilbert.transformer.layer.1.attention.v_lin.m_in 0.277413547039032 distilbert.transformer.layer.1.attention.out_lin.m_out 0.26822805404663086 distilbert.transformer.layer.1.attention.out_lin.m_in 0.23911237716674805 distilbert.transformer.layer.2.attention.q_lin.m_out 0.3618754744529724 distilbert.transformer.layer.2.attention.q_lin.m_in 0.29656681418418884 distilbert.transformer.layer.2.attention.k_lin.m_out 0.38085007667541504 distilbert.transformer.layer.2.attention.k_lin.m_in 0.32065433263778687 distilbert.transformer.layer.2.attention.v_lin.m_out 0.2376464307308197 distilbert.transformer.layer.2.attention.v_lin.m_in 0.2442511022090912 distilbert.transformer.layer.2.attention.out_lin.m_out 0.24287188053131104 distilbert.transformer.layer.2.attention.out_lin.m_in 0.24134962260723114 distilbert.transformer.layer.3.attention.q_lin.m_out 0.38565611839294434 distilbert.transformer.layer.3.attention.q_lin.m_in 0.29446443915367126 distilbert.transformer.layer.3.attention.k_lin.m_out 0.41610729694366455 distilbert.transformer.layer.3.attention.k_lin.m_in 0.2909688949584961 distilbert.transformer.layer.3.attention.v_lin.m_out 0.24268102645874023 distilbert.transformer.layer.3.attention.v_lin.m_in 0.22880792617797852 distilbert.transformer.layer.3.attention.out_lin.m_out 0.24307577311992645 distilbert.transformer.layer.3.attention.out_lin.m_in 0.2710449695587158 distilbert.transformer.layer.4.attention.q_lin.m_out 0.3831055164337158 distilbert.transformer.layer.4.attention.q_lin.m_in 0.24855349957942963 distilbert.transformer.layer.4.attention.k_lin.m_out 0.4071647524833679 distilbert.transformer.layer.4.attention.k_lin.m_in 0.2669607102870941 distilbert.transformer.layer.4.attention.v_lin.m_out 0.2469301074743271 distilbert.transformer.layer.4.attention.v_lin.m_in 0.22498857975006104 distilbert.transformer.layer.4.attention.out_lin.m_out 0.27932173013687134 distilbert.transformer.layer.4.attention.out_lin.m_in 0.23366175591945648 distilbert.transformer.layer.5.attention.q_lin.m_out 0.29120978713035583 distilbert.transformer.layer.5.attention.q_lin.m_in 0.23429414629936218 distilbert.transformer.layer.5.attention.k_lin.m_out 0.2640606164932251 distilbert.transformer.layer.5.attention.k_lin.m_in 0.23716507852077484 distilbert.transformer.layer.5.attention.v_lin.m_out 0.17384551465511322 
distilbert.transformer.layer.5.attention.v_lin.m_in 0.22829030454158783 distilbert.transformer.layer.5.attention.out_lin.m_out 0.1812361180782318 distilbert.transformer.layer.5.attention.out_lin.m_in 0.24070218205451965 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 15.9995 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 11.2283 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 15.4525 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.9587 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 11.8576 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.4428 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.4034 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.9603 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 13.9474 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.7618 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 13.7240 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 10.5466 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 10.1964 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 10.3101 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 9.9476 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.5549 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 12.6874 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 10.6759 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 13.4066 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 11.3571 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 9.0099 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.8738 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 8.8574 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.6634 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 13.2470 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 10.6238 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 14.1565 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.7714 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 9.0560 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.2100 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.0960 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 10.6091 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 13.7025 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.6629 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.8522 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.8785 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.9477 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.3498 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.1896 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.6145 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.0281 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 8.8945 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 9.4590 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.8465 distilbert.transformer.layer.5.attention.v_lin.m_out weight 
norm: 7.0202 distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.3106 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 7.4953 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 9.0249
In [14]:
# Set LoRA dropout to 0.1 to avoid "overheating" the lora.B channel,
# but avoid dropout = 0.0 when lora.B is already large: that tends to make the LoRA adapters die out,
# likely due to unregularised overfitting and gradient collapse on low-magnitude parameters.
set_all_lora_dropout(model_ddora_all_attn, 0.1)
print_dropout_rates(model_ddora_all_attn)
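# For reference, a minimal sketch of what these two helpers could look like
# (hypothetical implementations; the actual definitions live in an earlier cell and may differ).
# They assume each adapted Linear wraps its LoRA branch in a `.lora` submodule owning an nn.Dropout:
# def set_all_lora_dropout(model, p):
#     for name, module in model.named_modules():
#         if name.endswith(".lora") and hasattr(module, "dropout"):
#             module.dropout.p = p
# def print_dropout_rates(model):
#     for name, module in model.named_modules():
#         if name.endswith(".lora") and hasattr(module, "dropout"):
#             print(f"{name}.dropout.p = {module.dropout.p}")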
dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor = 2.0
batch_size = 32
learning_rate = 3e-3 ###############
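# Note (assumption): with the usual LoRA convention the low-rank update is scaled by
# lora_alpha / lora_rank = 128 / 16 = 8; whether this custom DDoRA layer follows that convention,
# and how it combines it with scaling_factor = 2.0, depends on the layer definition earlier in the notebook.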
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
output_dir=f"{output_dir_prefix}lora-all-attn",
num_train_epochs=2,
#max_steps=100,
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=weight_decay,
evaluation_strategy="steps",  # deprecated alias; recent transformers versions use eval_strategy
eval_steps=eval_steps,
logging_steps=logging_steps,
save_steps=eval_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
disable_tqdm=False,
push_to_hub=False,
max_grad_norm=1.0,
report_to="none",
log_level="error"
)
trainer_ddora_all_attn = Trainer(
model=model_ddora_all_attn,
args=training_args_ddora_all_attn,
train_dataset=dataset_encoded["train"],
eval_dataset=dataset_encoded["validation"],
compute_metrics=compute_metrics,
)
trainer_ddora_all_attn.optimizer = create_custom_optimizer(
trainer_ddora_all_attn.model,
base_lr=3e-3, ###########
lr_B_scale=0.5, #############
lr_scale_params=0.75, #########
weight_decay=1e-5,
)
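# Rough shape of create_custom_optimizer (hypothetical sketch; the real helper is defined earlier
# in the notebook and may differ): three AdamW parameter groups so that lora.B and the DDoRA
# scale/magnitude vectors train at scaled-down learning rates relative to lora.A.
# def create_custom_optimizer(model, base_lr, lr_B_scale, lr_scale_params, weight_decay):
#     groups = {"A": [], "B": [], "scale": []}
#     for name, p in model.named_parameters():
#         if not p.requires_grad:
#             continue
#         if "lora.B" in name:
#             groups["B"].append(p)
#         elif "scale" in name or name.endswith((".m", ".m_in", ".m_out")):
#             groups["scale"].append(p)
#         else:
#             groups["A"].append(p)
#     return torch.optim.AdamW(
#         [
#             {"params": groups["A"], "lr": base_lr},
#             {"params": groups["B"], "lr": base_lr * lr_B_scale},
#             {"params": groups["scale"], "lr": base_lr * lr_scale_params},
#         ],
#         weight_decay=weight_decay,
#     )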
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
# Train!
trainer_ddora_all_attn.train()
print(torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
grouped = defaultdict(list)
for name, val in vals:
grouped[name].append(val)
agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}
# Print the averaged per-layer statistics
for name in agg["A_abs_mean"]:
print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[1564/1564 3:55:58, Epoch 2/2]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.092800 | 0.217399 | 0.927200 | 0.927236 |
100 | 0.084100 | 0.238997 | 0.920000 | 0.920066 |
150 | 0.126400 | 0.216239 | 0.923200 | 0.923281 |
200 | 0.053400 | 0.236535 | 0.930400 | 0.930336 |
250 | 0.067600 | 0.236453 | 0.928800 | 0.928805 |
300 | 0.071100 | 0.234183 | 0.924000 | 0.924067 |
350 | 0.072800 | 0.249843 | 0.913600 | 0.913759 |
400 | 0.039700 | 0.277594 | 0.926400 | 0.926411 |
450 | 0.055000 | 0.288970 | 0.915200 | 0.915323 |
500 | 0.056200 | 0.260812 | 0.924800 | 0.924871 |
550 | 0.041600 | 0.277263 | 0.930400 | 0.930207 |
600 | 0.065800 | 0.261826 | 0.928000 | 0.928000 |
650 | 0.142700 | 0.220620 | 0.924000 | 0.924017 |
700 | 0.141100 | 0.213520 | 0.924000 | 0.924006 |
750 | 0.132400 | 0.214614 | 0.928000 | 0.927989 |
800 | 0.137800 | 0.215015 | 0.919200 | 0.919280 |
850 | 0.081300 | 0.228305 | 0.920000 | 0.920066 |
900 | 0.083900 | 0.225849 | 0.929600 | 0.929529 |
950 | 0.094200 | 0.228597 | 0.924800 | 0.924751 |
1000 | 0.077600 | 0.230933 | 0.924800 | 0.924822 |
1050 | 0.095400 | 0.225562 | 0.924800 | 0.924724 |
1100 | 0.092100 | 0.225185 | 0.927200 | 0.927194 |
1150 | 0.069200 | 0.227096 | 0.926400 | 0.926377 |
1200 | 0.096100 | 0.228098 | 0.925600 | 0.925570 |
1250 | 0.085600 | 0.227183 | 0.928000 | 0.927965 |
1300 | 0.095200 | 0.226917 | 0.928000 | 0.927989 |
1350 | 0.098800 | 0.226520 | 0.929600 | 0.929554 |
1400 | 0.105300 | 0.225165 | 0.926400 | 0.926377 |
1450 | 0.110700 | 0.227184 | 0.924800 | 0.924788 |
1500 | 0.117300 | 0.224868 | 0.928000 | 0.927965 |
1550 | 0.125300 | 0.225693 | 0.927200 | 0.927159 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 559443 KiB | 12777 MiB | 524962 GiB | 524962 GiB | | from large pool | 541440 KiB | 12712 MiB | 523010 GiB | 523010 GiB | | from small pool | 18003 KiB | 67 MiB | 1952 GiB | 1951 GiB | |---------------------------------------------------------------------------| | Active memory | 559443 KiB | 12777 MiB | 524962 GiB | 524962 GiB | | from large pool | 541440 KiB | 12712 MiB | 523010 GiB | 523010 GiB | | from small pool | 18003 KiB | 67 MiB | 1952 GiB | 1951 GiB | |---------------------------------------------------------------------------| | Requested memory | 557230 KiB | 12774 MiB | 524473 GiB | 524473 GiB | | from large pool | 539228 KiB | 12708 MiB | 522531 GiB | 522530 GiB | | from small pool | 18002 KiB | 67 MiB | 1942 GiB | 1942 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 13058 MiB | 13058 MiB | 50096 MiB | 37038 MiB | | from large pool | 12988 MiB | 12988 MiB | 49840 MiB | 36852 MiB | | from small pool | 70 MiB | 70 MiB | 256 MiB | 186 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 186029 KiB | 403075 KiB | 87871 GiB | 87871 GiB | | from large pool | 179456 KiB | 396800 KiB | 85768 GiB | 85768 GiB | | from small pool | 6573 KiB | 43066 KiB | 2103 GiB | 2103 GiB | |---------------------------------------------------------------------------| | Allocations | 860 | 1343 | 48373 K | 48372 K | | from large pool | 80 | 334 | 13518 K | 13518 K | | from small pool | 780 | 1235 | 34854 K | 34853 K | |---------------------------------------------------------------------------| | Active allocs | 860 | 1343 | 48373 K | 48372 K | | from large pool | 80 | 334 | 13518 K | 13518 K | | from small pool | 780 | 1235 | 34854 K | 34853 K | |---------------------------------------------------------------------------| | GPU reserved segments | 263 | 266 | 977 | 714 | | from large pool | 228 | 231 | 849 | 621 | | from small pool | 35 | 35 | 128 | 93 | |---------------------------------------------------------------------------| | Non-releasable allocs | 39 | 91 | 21495 K | 21495 K | | from large pool | 19 | 25 | 2661 K | 2661 K | | from small pool | 20 | 73 | 18833 K | 18833 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.3191, |B|=0.2332, |∇A|=2.561e-05, |∇B|=2.274e-05, |LoRA(x)|=3.903e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.3156, |B|=0.2323, |∇A|=1.262e-05, |∇B|=1.765e-05, |LoRA(x)|=4.786e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2983, |B|=0.2095, |∇A|=1.981e-05, |∇B|=2.236e-05, |LoRA(x)|=4.594e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2763, |B|=0.2038, |∇A|=2.055e-05, |∇B|=3.635e-05, |LoRA(x)|=2.653e+04, 
B≠0=12288 distilbert.transformer.layer.0.ffn.lin1: |A|=0.3098, |B|=0.231, |∇A|=3.863e-05, |∇B|=2.089e-05, |LoRA(x)|=8.88e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2825, |B|=0.1993, |∇A|=1.185e-05, |∇B|=4.406e-05, |LoRA(x)|=4.787e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.3094, |B|=0.2208, |∇A|=2.05e-05, |∇B|=2.032e-05, |LoRA(x)|=3.828e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.3013, |B|=0.2237, |∇A|=1.563e-05, |∇B|=2.026e-05, |LoRA(x)|=3.889e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2896, |B|=0.195, |∇A|=1.914e-05, |∇B|=3.137e-05, |LoRA(x)|=5.235e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2709, |B|=0.2001, |∇A|=1.835e-05, |∇B|=4.02e-05, |LoRA(x)|=3.506e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.3109, |B|=0.2294, |∇A|=3.389e-05, |∇B|=2.665e-05, |LoRA(x)|=9.808e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.2927, |B|=0.1955, |∇A|=2.241e-05, |∇B|=7.346e-05, |LoRA(x)|=2.2e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.3103, |B|=0.2256, |∇A|=1.535e-05, |∇B|=2.131e-05, |LoRA(x)|=5.076e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.3154, |B|=0.2339, |∇A|=1.946e-05, |∇B|=2.336e-05, |LoRA(x)|=5.059e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2673, |B|=0.1835, |∇A|=6.496e-06, |∇B|=1.598e-05, |LoRA(x)|=8.126e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2741, |B|=0.1963, |∇A|=2.332e-05, |∇B|=3.492e-05, |LoRA(x)|=2.586e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.3142, |B|=0.2299, |∇A|=2.917e-05, |∇B|=2.307e-05, |LoRA(x)|=1.238e+05, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.282, |B|=0.1896, |∇A|=2.101e-05, |∇B|=6.349e-05, |LoRA(x)|=3.022e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.3024, |B|=0.2267, |∇A|=1.685e-05, |∇B|=2.754e-05, |LoRA(x)|=4.602e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.3085, |B|=0.2321, |∇A|=2.751e-05, |∇B|=2.773e-05, |LoRA(x)|=5.096e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2715, |B|=0.1777, |∇A|=1.168e-05, |∇B|=1.535e-05, |LoRA(x)|=4.822e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.292, |B|=0.1955, |∇A|=3.474e-05, |∇B|=2.617e-05, |LoRA(x)|=2.793e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2855, |B|=0.2127, |∇A|=2.62e-05, |∇B|=1.941e-05, |LoRA(x)|=9.911e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2742, |B|=0.1865, |∇A|=1.253e-05, |∇B|=3.196e-05, |LoRA(x)|=5.215e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.285, |B|=0.232, |∇A|=1.148e-05, |∇B|=2.4e-05, |LoRA(x)|=5.61e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.286, |B|=0.2225, |∇A|=3.431e-05, |∇B|=2.507e-05, |LoRA(x)|=4.25e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2643, |B|=0.1806, |∇A|=8.402e-06, |∇B|=9.681e-06, |LoRA(x)|=5.6e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2811, |B|=0.1879, |∇A|=3.767e-05, |∇B|=2.109e-05, |LoRA(x)|=2.829e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2782, |B|=0.2039, |∇A|=1.628e-05, |∇B|=1.005e-05, |LoRA(x)|=1.154e+05, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.261, |B|=0.1764, |∇A|=3.015e-06, |∇B|=1.282e-05, |LoRA(x)|=1.131e+05, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.1344, |B|=0.09149, |∇A|=4.199e-06, |∇B|=2.195e-05, 
|LoRA(x)|=4.091e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.1097, |B|=0.07668, |∇A|=7.525e-06, |∇B|=6.607e-06, |LoRA(x)|=2.306e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.112, |B|=0.07693, |∇A|=4.317e-06, |∇B|=4.99e-06, |LoRA(x)|=1.804e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.1169, |B|=0.07649, |∇A|=1.616e-05, |∇B|=1.189e-05, |LoRA(x)|=8248, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.1161, |B|=0.08647, |∇A|=2.731e-06, |∇B|=2.285e-06, |LoRA(x)|=8.391e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.1167, |B|=0.07743, |∇A|=9.559e-07, |∇B|=7.835e-06, |LoRA(x)|=7.038e+04, B≠0=12288
In [15]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.32006871700286865 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.23343026638031006 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.3163188099861145 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.23248806595802307 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.29897835850715637 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.20968830585479736 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2769838273525238 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.20406422019004822 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3101019859313965 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.2311478555202484 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2829970121383667 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.19944843649864197 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.30994346737861633 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.22095946967601776 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.30191946029663086 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.22382184863090515 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2906057834625244 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.19542376697063446 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2710033655166626 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.20037208497524261 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.31169062852859497 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.22961382567882538 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.293379008769989 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.19578644633293152 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.3109404742717743 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.22571133077144623 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.31563350558280945 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.234167218208313 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.2676677107810974 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.18374203145503998 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.27449172735214233 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.19648432731628418 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.3144126534461975 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.23003749549388885 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.28201884031295776 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.18987713754177094 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.3032500147819519 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.2268686294555664 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.3090388774871826 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.23231551051139832 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2715294361114502 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.17792069911956787 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.29230600595474243 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.19575712084770203 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.2863031029701233 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.21289019286632538 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.2745441794395447 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.18678216636180878 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.28569456934928894 
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.23207253217697144 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.28643906116485596 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.2226107120513916 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.26507601141929626 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.18089240789413452 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.28175055980682373 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.1881856918334961 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.278497576713562 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.20418387651443481 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.261717826128006 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.1765369176864624 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.14042069017887115 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.09242694079875946 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.1104985922574997 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.07718655467033386 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.1133069396018982 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.07779480516910553 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.11800543963909149 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.0768902599811554 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.11741181463003159 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.08706154674291611 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.11588611453771591 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.07698755711317062 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 44.9870 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 32.5584 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 44.3253 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 32.3237 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 42.5150 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 29.6270 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 39.6143 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 28.5344 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.8038 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 64.5621 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 81.4011 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 28.0506 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 43.9143 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 30.9039 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 42.7763 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 31.3954 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 40.9525 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 27.5261 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.7577 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 28.0088 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 44.1707 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 64.1219 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 83.6082 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 27.4695 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 43.6299 
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 31.6104 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 44.5886 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 32.7635 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 38.2511 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 25.4722 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 39.2425 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 27.5227 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 44.3785 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 64.6458 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 81.0003 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 26.8458 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 42.6667 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 31.7096 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 43.7228 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 32.3791 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 38.6395 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 25.0596 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 41.5452 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 27.6174 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 40.5608 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 59.9759 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 79.1136 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 26.6528 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 40.4837 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 32.6220 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 40.6622 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 31.4454 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 37.9944 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 25.3504 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 40.3440 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 26.7918 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 39.6790 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 57.8006 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 74.4113 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 24.9702 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 20.0911 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 13.2669 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 15.6004 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 11.0380 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 16.5866 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 10.7600 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 16.9635 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 11.1312 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 16.8478 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 24.8647 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 33.2164 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 11.1452 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.1905102729797363 
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9499949216842651 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.1458122730255127 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9455236196517944 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9724862575531006 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.860857367515564 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.8391855955123901 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.8599152565002441 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.1013097763061523 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9234527349472046 distilbert.transformer.layer.1.attention.k_lin.scale_out 2.0495128631591797 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9428495168685913 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9081525802612305 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9107887744903564 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.8454234600067139 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.830349326133728 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.021239757537842 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9438060522079468 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0388073921203613 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9477781057357788 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.8382298946380615 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.852615237236023 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.801897406578064 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.8316043615341187 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0587120056152344 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9431657791137695 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.1159026622772217 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9063804149627686 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.8447027206420898 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.825028419494629 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.8210327625274658 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.8625237941741943 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.0286831855773926 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.8318060636520386 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.090421199798584 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.8871725797653198 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8073701858520508 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.7961653470993042 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.860413908958435 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.7698571681976318 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.8676936626434326 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9185214042663574 distilbert.transformer.layer.5.attention.k_lin.scale_out 1.9522435665130615 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9019559621810913 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.706939935684204 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.8740447759628296 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.685848593711853 distilbert.transformer.layer.5.attention.out_lin.scale_in 
1.9280996322631836 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 61.9183 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 54.9021 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 60.7254 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8114 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 55.8152 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 52.8003 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 52.1086 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 52.5454 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 59.3379 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 54.2897 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 57.9845 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 54.6535 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.8677 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8049 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 52.2140 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 51.7252 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 57.0675 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.6921 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.7274 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 54.8940 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 51.7424 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 52.2546 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 51.0737 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 51.8399 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 58.0506 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.6317 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 59.6763 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 53.7666 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 52.0499 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 51.4959 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 51.6481 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 52.6626 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.5260 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 51.8521 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 59.0424 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 53.0973 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.3750 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 50.8595 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 52.8702 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 50.4516 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 53.1685 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.7801 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 54.7189 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.2050 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 
48.6783 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 52.6801 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 48.4120 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.0150 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.5408788919448853 distilbert.transformer.layer.0.attention.q_lin.m_in 0.33161652088165283 distilbert.transformer.layer.0.attention.k_lin.m_out 0.4993875026702881 distilbert.transformer.layer.0.attention.k_lin.m_in 0.32487034797668457 distilbert.transformer.layer.0.attention.v_lin.m_out 0.38412022590637207 distilbert.transformer.layer.0.attention.v_lin.m_in 0.2862411141395569 distilbert.transformer.layer.0.attention.out_lin.m_out 0.2917907238006592 distilbert.transformer.layer.0.attention.out_lin.m_in 0.25911882519721985 distilbert.transformer.layer.1.attention.q_lin.m_out 0.4537653923034668 distilbert.transformer.layer.1.attention.q_lin.m_in 0.3067074120044708 distilbert.transformer.layer.1.attention.k_lin.m_out 0.441150039434433 distilbert.transformer.layer.1.attention.k_lin.m_in 0.305277556180954 distilbert.transformer.layer.1.attention.v_lin.m_out 0.3112054765224457 distilbert.transformer.layer.1.attention.v_lin.m_in 0.29263025522232056 distilbert.transformer.layer.1.attention.out_lin.m_out 0.29383260011672974 distilbert.transformer.layer.1.attention.out_lin.m_in 0.24737994372844696 distilbert.transformer.layer.2.attention.q_lin.m_out 0.39384183287620544 distilbert.transformer.layer.2.attention.q_lin.m_in 0.31080108880996704 distilbert.transformer.layer.2.attention.k_lin.m_out 0.4301289916038513 distilbert.transformer.layer.2.attention.k_lin.m_in 0.32837507128715515 distilbert.transformer.layer.2.attention.v_lin.m_out 0.25836628675460815 distilbert.transformer.layer.2.attention.v_lin.m_in 0.2523249387741089 distilbert.transformer.layer.2.attention.out_lin.m_out 0.26635345816612244 distilbert.transformer.layer.2.attention.out_lin.m_in 0.252669095993042 distilbert.transformer.layer.3.attention.q_lin.m_out 0.4221906363964081 distilbert.transformer.layer.3.attention.q_lin.m_in 0.3118211030960083 distilbert.transformer.layer.3.attention.k_lin.m_out 0.4609851837158203 distilbert.transformer.layer.3.attention.k_lin.m_in 0.30377718806266785 distilbert.transformer.layer.3.attention.v_lin.m_out 0.26118120551109314 distilbert.transformer.layer.3.attention.v_lin.m_in 0.23761092126369476 distilbert.transformer.layer.3.attention.out_lin.m_out 0.27069878578186035 distilbert.transformer.layer.3.attention.out_lin.m_in 0.28009089827537537 distilbert.transformer.layer.4.attention.q_lin.m_out 0.40237540006637573 distilbert.transformer.layer.4.attention.q_lin.m_in 0.2583754360675812 distilbert.transformer.layer.4.attention.k_lin.m_out 0.4506079852581024 distilbert.transformer.layer.4.attention.k_lin.m_in 0.27900615334510803 distilbert.transformer.layer.4.attention.v_lin.m_out 0.259660005569458 distilbert.transformer.layer.4.attention.v_lin.m_in 0.23736310005187988 distilbert.transformer.layer.4.attention.out_lin.m_out 0.298389196395874 distilbert.transformer.layer.4.attention.out_lin.m_in 0.24150829017162323 distilbert.transformer.layer.5.attention.q_lin.m_out 0.30278030037879944 distilbert.transformer.layer.5.attention.q_lin.m_in 0.2885325849056244 distilbert.transformer.layer.5.attention.k_lin.m_out 0.27983635663986206 distilbert.transformer.layer.5.attention.k_lin.m_in 0.23877453804016113 distilbert.transformer.layer.5.attention.v_lin.m_out 0.17999951541423798 
distilbert.transformer.layer.5.attention.v_lin.m_in 0.2295137643814087 distilbert.transformer.layer.5.attention.out_lin.m_out 0.20580878853797913 distilbert.transformer.layer.5.attention.out_lin.m_in 0.23703013360500336 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 17.8224 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 11.5302 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 16.7042 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 11.2798 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 13.1700 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.7177 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 10.5116 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 10.1391 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 15.3281 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 11.0318 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 15.0336 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 10.8379 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 11.2211 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 10.6288 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 10.6918 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.7514 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 13.6758 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 10.9967 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 14.7531 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 11.5981 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 9.6700 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 10.0536 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.6048 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.9626 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 14.2182 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 10.9856 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 15.4504 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 11.0052 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 9.5728 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.4388 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.9672 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 10.8309 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 14.2122 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.8723 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 15.0903 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 10.1352 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 9.2822 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.5726 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.7200 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.8551 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.4841 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 10.0902 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 9.8908 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.0085 distilbert.transformer.layer.5.attention.v_lin.m_out 
weight norm: 7.3518 distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.4096 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 8.2823 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 9.1136
In [ ]: