In [1]:
import torch
import sys
import gc
print(sys.version)
print(f"PyTorch Version: {torch.__version__}")
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
print(f"CUDA Version: {torch.version.cuda}")
print(torch.cuda.get_device_name(0))
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
import bitsandbytes
import peft
import transformers
print(transformers.__version__)
print(f"bitsandbytes version: {bitsandbytes.__version__}")
print(f"peft version: {peft.__version__}")
print(torch.cuda.is_bf16_supported())
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)] PyTorch Version: 2.5.1+cu121 True 1 CUDA Version: 12.1 NVIDIA GeForce RTX 4080 Laptop GPU 4.50.0.dev0 bitsandbytes version: 0.45.3 peft version: 0.15.2.dev0 True
In [2]:
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
imdb_dataset = imdb_dataset.rename_column("label", "labels")
# Split the test set into validation and test sets
test_val_split = imdb_dataset['test'].train_test_split(test_size=0.95, seed=42)
imdb_dataset['validation'] = test_val_split['train']
imdb_dataset['test'] = test_val_split['test']
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
# Determine the number of labels
num_labels = len(set(imdb_dataset["train"]["labels"]))
print(f"Number of labels: {num_labels}")
# Load the tokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Tokenize the whole dataset, truncate to 384 tokens
def tokenize(batch):
return tokenizer(batch["text"], padding=True, truncation=True, max_length=384)
dataset_encoded = imdb_dataset.map(tokenize, batched=True, batch_size=None)
# Load the pretrained model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = (AutoModelForSequenceClassification
.from_pretrained(model_ckpt, num_labels=num_labels)
.to(device))
#print(model)
Number of labels: 2
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [3]:
# Helper functions
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
labels = pred.label_ids
preds = pred.predictions.argmax(-1)
f1 = f1_score(labels, preds, average="weighted")
acc = accuracy_score(labels, preds)
return {"accuracy": acc, "f1": f1}
def count_trainable_parameters(model):
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
return total_params, trainable_params, 100 * trainable_params / total_params
def freeze_model_layers(model, unfreeze_pre_classifier=False):
# Freeze all parameters
for param in model.parameters():
param.requires_grad = False
# Unfreeze LoRA and DoRA-specific params, including lora_norm
for name, param in model.named_parameters():
if (
"lora.A" in name
or "lora.B" in name
or "lora_norm" in name
or name.endswith(".m") # For DoRA
or name.endswith(".m_in") # For DDoRA
or name.endswith(".m_out") # For DDoRA
or "scale" in name
):
param.requires_grad = True
# Unfreeze classifier layer (always)
for name, param in model.named_parameters():
if name.startswith("classifier."):
param.requires_grad = True
# unfreeze pre-classifier
if unfreeze_pre_classifier:
for name, param in model.named_parameters():
if name.startswith("pre_classifier."):
param.requires_grad = True
In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.autograd.set_detect_anomaly(True)
class LoRALayer(nn.Module):
def __init__(self, in_dim, out_dim, rank, alpha, dropout_rate=0.0):
super().__init__()
std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
self.B = nn.Parameter(1e-4 * torch.randn(rank, out_dim) * std_dev) # not all zeroes!
self.alpha = alpha
self.dropout = nn.Dropout(dropout_rate)
def forward(self, x):
dropped = self.dropout(x @ self.A)
return self.alpha * (dropped @ self.B)
class LinearWithDoubleDoRA(nn.Module):
def __init__(self, linear, rank, alpha, scaling_factor=1.0):
super().__init__()
self.linear = linear
self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
self.m_out = nn.Parameter(torch.randn(1, linear.out_features) * std_dev)
self.m_in = nn.Parameter(torch.randn(linear.in_features, 1) * std_dev)
# Orthogonal initialization for m_out
#self.m_out = nn.Parameter(torch.empty(1, linear.out_features))
#nn.init.orthogonal_(self.m_out)
# Orthogonal initialization for m_in
#self.m_in = nn.Parameter(torch.empty(linear.in_features, 1))
#nn.init.orthogonal_(self.m_in)
self.scale_out = nn.Parameter(torch.full((1, linear.out_features), float(scaling_factor)))
self.scale_in = nn.Parameter(torch.full((linear.in_features, 1), float(scaling_factor)))
self.last_lora_output_norm = 0.0 # For monitoring
def forward(self, x):
scaled_x = x * self.scale_in.T * self.m_in.T
linear_output = self.linear(x)
lora_output = self.lora(scaled_x)
lora_output = F.dropout(lora_output, p=0.05) #prevents overfitting to small artifacts, spreads useful signal across more of the low-rank space
lora_output_norm = lora_output / (lora_output.norm(p=2, dim=1, keepdim=True) + 1e-9)
self.last_lora_output_norm = lora_output.norm(p=2, dim=-1).mean().item()
dora_modification = self.scale_out * self.m_out * lora_output_norm
return linear_output + dora_modification
def inject_ddora_all_attn(model, rank, alpha, scaling_factor=1.0, dropout_rate=0.0, disable_layers=None):
target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin", "ffn.lin1", "ffn.lin2"]
#target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin"]
if disable_layers is None:
disable_layers = []
for name, module in model.named_modules():
if isinstance(module, nn.Linear) and any(layer in name for layer in target_layers):
# Try to extract layer index from names like "transformer.layer.4.attention.q_lin"
parts = name.split('.')
layer_idx = None
for i, part in enumerate(parts):
if part == "layer" and i + 1 < len(parts):
try:
layer_idx = int(parts[i + 1])
break
except ValueError:
pass
if layer_idx is not None and layer_idx in disable_layers:
continue
parent_name = name.rsplit('.', 1)[0]
parent_module = model.get_submodule(parent_name)
original_linear = getattr(parent_module, name.split('.')[-1])
ddora_layer = LinearWithDoubleDoRA(original_linear, rank, alpha, scaling_factor)
ddora_layer.lora.dropout = nn.Dropout(dropout_rate)
setattr(parent_module, name.split('.')[-1], ddora_layer)
return model
In [5]:
def monitor_lora_parameters(model, threshold=1e-7):
monitor = {
"A_abs_mean": [],
"B_abs_mean": [],
"A_grad_mean": [],
"B_grad_mean": [],
"lora_output_norm": [],
"B_nonzero_count": [],
}
hooks = []
for name, module in model.named_modules():
if hasattr(module, "lora") and hasattr(module.lora, "A") and hasattr(module.lora, "B"):
A_param = module.lora.A
B_param = module.lora.B
# Gradient hooks (directly on nn.Parameter)
if A_param.requires_grad:
hooks.append(A_param.register_hook(lambda grad, n=name: monitor["A_grad_mean"].append((n, grad.abs().mean().item()))))
if B_param.requires_grad:
hooks.append(B_param.register_hook(lambda grad, n=name: monitor["B_grad_mean"].append((n, grad.abs().mean().item()))))
# Forward hook for value stats
def forward_hook(mod, inp, out, n=name):
A_mean = mod.lora.A.abs().mean().item()
B_mean = mod.lora.B.abs().mean().item()
B_nnz = (mod.lora.B.abs() > threshold).sum().item()
monitor["A_abs_mean"].append((n, A_mean))
monitor["B_abs_mean"].append((n, B_mean))
monitor["B_nonzero_count"].append((n, B_nnz))
monitor["lora_output_norm"].append((n, mod.last_lora_output_norm))
hooks.append(module.register_forward_hook(forward_hook))
return hooks, monitor
from transformers import TrainingArguments
def monitor_gradients(model):
hooks = []
gradient_history = {}
for name, param in model.named_parameters():
if param.requires_grad:
gradient_history[name] = []
def get_hook(n): # capture the name immediately
def hook(grad):
gradient_history[n].append(grad.abs().mean().item())
return hook
hooks.append(param.register_hook(get_hook(name)))
return hooks, gradient_history
In [6]:
learning_rate = 1e-2 #############
dropout = 0.3 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
output_dir_prefix = "finetuned-imdb-"
import copy
torch.manual_seed(137)
model_ddora_all_attn = copy.deepcopy(model)
model_ddora_all_attn = inject_ddora_all_attn(model_ddora_all_attn, lora_rank, lora_alpha, scaling_factor, dropout)
freeze_model_layers(model_ddora_all_attn, unfreeze_pre_classifier=True)
total_params_ddora, trainable_params_ddora, percentage_ddora = count_trainable_parameters(model_ddora_all_attn)
print(f"\nDDoRA (All Attention) - Total parameters: {total_params_ddora:,}")
print(f"DDoRA (All Attention) - Trainable parameters: {trainable_params_ddora:,} ({percentage_ddora:.2f}%)")
# Sanity check
#print("\nTrainable parameters after freezing:")
#for name, param in model_ddora_all_attn.named_parameters():
# if param.requires_grad:
# print(name)
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
output_dir=f"{output_dir_prefix}lora-all-attn",
num_train_epochs=3,
#max_steps=200,
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=weight_decay,
evaluation_strategy="steps",
eval_steps=eval_steps,
logging_steps=logging_steps,
save_steps=eval_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
disable_tqdm=False,
push_to_hub=False,
max_grad_norm=1.0, #####
report_to="none",
log_level="error"
)
trainer_ddora_all_attn = Trainer(
model=model_ddora_all_attn,
args=training_args_ddora_all_attn,
train_dataset=dataset_encoded["train"],
eval_dataset=dataset_encoded["validation"],
compute_metrics=compute_metrics,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
grouped = defaultdict(list)
for name, val in vals:
grouped[name].append(val)
agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}
# Example output
for name in agg["A_abs_mean"]:
print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
DDoRA (All Attention) - Total parameters: 68,448,002 DDoRA (All Attention) - Trainable parameters: 2,085,122 (3.05%)
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[2346/2346 4:14:13, Epoch 3/3]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.601500 | 0.292667 | 0.880000 | 0.880255 |
100 | 0.330500 | 0.289936 | 0.878400 | 0.878587 |
150 | 0.333400 | 0.285789 | 0.880800 | 0.881023 |
200 | 0.268200 | 0.264306 | 0.894400 | 0.894553 |
250 | 0.298500 | 0.240361 | 0.902400 | 0.902442 |
300 | 0.275700 | 0.234229 | 0.906400 | 0.906162 |
350 | 0.270200 | 0.235252 | 0.908000 | 0.907978 |
400 | 0.243100 | 0.264926 | 0.902400 | 0.902558 |
450 | 0.250700 | 0.300190 | 0.896000 | 0.894957 |
500 | 0.281500 | 0.248224 | 0.909600 | 0.909429 |
550 | 0.247700 | 0.252260 | 0.908000 | 0.907657 |
600 | 0.269500 | 0.243112 | 0.911200 | 0.910913 |
650 | 0.237300 | 0.225605 | 0.912000 | 0.911927 |
700 | 0.233200 | 0.235947 | 0.911200 | 0.910974 |
750 | 0.246900 | 0.296594 | 0.901600 | 0.900834 |
800 | 0.247200 | 0.267171 | 0.903200 | 0.902447 |
850 | 0.223600 | 0.224062 | 0.911200 | 0.911207 |
900 | 0.206000 | 0.235725 | 0.914400 | 0.914495 |
950 | 0.233100 | 0.234277 | 0.919200 | 0.919140 |
1000 | 0.203900 | 0.212509 | 0.919200 | 0.919154 |
1050 | 0.207700 | 0.219393 | 0.924000 | 0.923772 |
1100 | 0.202600 | 0.227999 | 0.924800 | 0.924583 |
1150 | 0.192400 | 0.203774 | 0.922400 | 0.922417 |
1200 | 0.202600 | 0.218544 | 0.919200 | 0.919271 |
1250 | 0.197800 | 0.211557 | 0.926400 | 0.926253 |
1300 | 0.212400 | 0.205286 | 0.920000 | 0.920093 |
1350 | 0.213300 | 0.200498 | 0.924000 | 0.924067 |
1400 | 0.205300 | 0.203650 | 0.928000 | 0.927953 |
1450 | 0.198100 | 0.205225 | 0.918400 | 0.918446 |
1500 | 0.200500 | 0.195481 | 0.920000 | 0.919873 |
1550 | 0.198500 | 0.207493 | 0.925600 | 0.925617 |
1600 | 0.167500 | 0.255866 | 0.918400 | 0.918495 |
1650 | 0.158800 | 0.212292 | 0.924000 | 0.923970 |
1700 | 0.168400 | 0.229653 | 0.914400 | 0.914528 |
1750 | 0.161200 | 0.244941 | 0.922400 | 0.922314 |
1800 | 0.169800 | 0.212388 | 0.924800 | 0.924738 |
1850 | 0.174600 | 0.217708 | 0.921600 | 0.921634 |
1900 | 0.146000 | 0.218510 | 0.923200 | 0.923200 |
1950 | 0.174400 | 0.204068 | 0.927200 | 0.927120 |
2000 | 0.163500 | 0.209466 | 0.913600 | 0.913740 |
2050 | 0.162600 | 0.200313 | 0.926400 | 0.926451 |
2100 | 0.169100 | 0.197281 | 0.928000 | 0.928021 |
2150 | 0.117800 | 0.217998 | 0.922400 | 0.922468 |
2200 | 0.153300 | 0.205229 | 0.931200 | 0.931131 |
2250 | 0.173100 | 0.197001 | 0.928000 | 0.927953 |
2300 | 0.157300 | 0.195606 | 0.927200 | 0.927194 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 564070 KiB | 12777 MiB | 196479 GiB | 196478 GiB | | from large pool | 546048 KiB | 12712 MiB | 195749 GiB | 195748 GiB | | from small pool | 18022 KiB | 67 MiB | 730 GiB | 730 GiB | |---------------------------------------------------------------------------| | Active memory | 564070 KiB | 12777 MiB | 196479 GiB | 196478 GiB | | from large pool | 546048 KiB | 12712 MiB | 195749 GiB | 195748 GiB | | from small pool | 18022 KiB | 67 MiB | 730 GiB | 730 GiB | |---------------------------------------------------------------------------| | Requested memory | 561856 KiB | 12774 MiB | 196296 GiB | 196296 GiB | | from large pool | 543836 KiB | 12708 MiB | 195570 GiB | 195569 GiB | | from small pool | 18020 KiB | 67 MiB | 726 GiB | 726 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 13058 MiB | 13058 MiB | 13058 MiB | 0 B | | from large pool | 12988 MiB | 12988 MiB | 12988 MiB | 0 B | | from small pool | 70 MiB | 70 MiB | 70 MiB | 0 B | |---------------------------------------------------------------------------| | Non-releasable memory | 183450 KiB | 337456 KiB | 31846 GiB | 31846 GiB | | from large pool | 174848 KiB | 305536 KiB | 31061 GiB | 31061 GiB | | from small pool | 8602 KiB | 43066 KiB | 784 GiB | 784 GiB | |---------------------------------------------------------------------------| | Allocations | 868 | 1343 | 18126 K | 18125 K | | from large pool | 82 | 334 | 5063 K | 5063 K | | from small pool | 786 | 1235 | 13063 K | 13062 K | |---------------------------------------------------------------------------| | Active allocs | 868 | 1343 | 18126 K | 18125 K | | from large pool | 82 | 334 | 5063 K | 5063 K | | from small pool | 786 | 1235 | 13063 K | 13062 K | |---------------------------------------------------------------------------| | GPU reserved segments | 266 | 266 | 266 | 0 | | from large pool | 231 | 231 | 231 | 0 | | from small pool | 35 | 35 | 35 | 0 | |---------------------------------------------------------------------------| | Non-releasable allocs | 42 | 91 | 7717 K | 7717 K | | from large pool | 19 | 25 | 975 K | 975 K | | from small pool | 23 | 73 | 6741 K | 6741 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2712, |B|=0.1645, |∇A|=1.242e-05, |∇B|=1.544e-05, |LoRA(x)|=3.399e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.2729, |B|=0.1675, |∇A|=6.177e-06, |∇B|=1.446e-05, |LoRA(x)|=3.671e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2637, |B|=0.1373, |∇A|=8.059e-06, |∇B|=2.125e-05, |LoRA(x)|=4.015e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2545, |B|=0.142, |∇A|=1.518e-05, |∇B|=4.65e-05, |LoRA(x)|=2.158e+04, B≠0=12288 distilbert.transformer.layer.0.ffn.lin1: |A|=0.2809, |B|=0.1687, |∇A|=2.046e-05, |∇B|=1.587e-05, |LoRA(x)|=8.516e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2528, |B|=0.1381, |∇A|=5.852e-06, |∇B|=4.107e-05, |LoRA(x)|=4.954e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2652, |B|=0.1512, |∇A|=1.248e-05, |∇B|=1.809e-05, |LoRA(x)|=2.561e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.2559, |B|=0.1557, |∇A|=7.45e-06, |∇B|=1.748e-05, |LoRA(x)|=3.104e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2501, |B|=0.1275, |∇A|=1.187e-05, |∇B|=3.63e-05, |LoRA(x)|=3.725e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2552, |B|=0.1281, |∇A|=1.236e-05, |∇B|=5.365e-05, |LoRA(x)|=3.199e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.2708, |B|=0.1609, |∇A|=1.571e-05, |∇B|=1.942e-05, |LoRA(x)|=9.91e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.2588, |B|=0.1337, |∇A|=9.242e-06, |∇B|=6.657e-05, |LoRA(x)|=2.476e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2595, |B|=0.1515, |∇A|=1.11e-05, |∇B|=2.483e-05, |LoRA(x)|=3.157e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2708, |B|=0.1635, |∇A|=1.243e-05, |∇B|=2.197e-05, |LoRA(x)|=3.55e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2464, |B|=0.1088, |∇A|=8.384e-06, |∇B|=5.13e-05, |LoRA(x)|=4.268e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2503, |B|=0.1339, |∇A|=2.356e-05, |∇B|=6.333e-05, |LoRA(x)|=1.632e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.283, |B|=0.1723, |∇A|=1.912e-05, |∇B|=2.311e-05, |LoRA(x)|=1.065e+05, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2517, |B|=0.131, |∇A|=1.063e-05, |∇B|=7.487e-05, |LoRA(x)|=3.153e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.2595, |B|=0.1643, |∇A|=1.092e-05, |∇B|=2.535e-05, |LoRA(x)|=3.189e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.2756, |B|=0.1721, |∇A|=2.031e-05, |∇B|=2.696e-05, |LoRA(x)|=3.227e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2468, |B|=0.1189, |∇A|=1.566e-05, |∇B|=4.28e-05, |LoRA(x)|=2.765e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.251, |B|=0.126, |∇A|=3.775e-05, |∇B|=6.053e-05, |LoRA(x)|=1.529e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2486, |B|=0.1463, |∇A|=2.647e-05, |∇B|=2.87e-05, |LoRA(x)|=6.15e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2441, |B|=0.1165, |∇A|=8.327e-06, |∇B|=7.226e-05, |LoRA(x)|=3.691e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.25, |B|=0.1638, |∇A|=1.185e-05, |∇B|=3.286e-05, |LoRA(x)|=3.044e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2585, |B|=0.1636, |∇A|=2.955e-05, |∇B|=2.798e-05, |LoRA(x)|=3.135e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2335, |B|=0.1084, |∇A|=7.934e-06, |∇B|=2.614e-05, |LoRA(x)|=3.104e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2523, |B|=0.1177, |∇A|=3.329e-05, |∇B|=5.98e-05, |LoRA(x)|=1.712e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.251, |B|=0.1328, |∇A|=1.922e-05, |∇B|=1.731e-05, |LoRA(x)|=6.409e+04, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2312, |B|=0.102, |∇A|=1.664e-06, |∇B|=5.231e-05, |LoRA(x)|=9.2e+04, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.2522, |B|=0.1596, |∇A|=6.446e-06, |∇B|=2.722e-05, |LoRA(x)|=5.28e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.2534, |B|=0.1391, |∇A|=2.139e-05, |∇B|=1.514e-05, |LoRA(x)|=3.592e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.2337, |B|=0.08995, |∇A|=4.012e-06, |∇B|=1.909e-05, |LoRA(x)|=3.959e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.2354, |B|=0.09846, |∇A|=1.088e-05, |∇B|=6.078e-05, |LoRA(x)|=1.886e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.2339, |B|=0.1149, |∇A|=8.083e-06, |∇B|=9.786e-06, |LoRA(x)|=5.218e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2258, |B|=0.09958, |∇A|=8.421e-07, |∇B|=4.586e-05, |LoRA(x)|=1.574e+05, B≠0=12288
In [7]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.2852030396461487 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.1868315190076828 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.28894203901290894 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.19284531474113464 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.2786567807197571 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.15678970515727997 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.26653891801834106 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.16393844783306122 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3024210035800934 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.19977959990501404 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.266163170337677 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16271300613880157 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.28112292289733887 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17357999086380005 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.26969343423843384 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17819923162460327 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.26163023710250854 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.14616072177886963 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.26657772064208984 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14856967329978943 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2862350344657898 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.1856018602848053 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.2755448818206787 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15843509137630463 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.27186107635498047 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.17322131991386414 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.285927951335907 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1866149604320526 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.25501346588134766 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12291982769966125 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.2613106966018677 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15331488847732544 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.3022487163543701 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.20070664584636688 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2643374800682068 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.1518675535917282 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2738485038280487 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.1887315809726715 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.29151996970176697 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.196702241897583 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.25722846388816833 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13603520393371582 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.260562539100647 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.1413680464029312 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.25821197032928467 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16667717695236206 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.25490492582321167 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13568715751171112 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2612036466598511 distilbert.transformer.layer.4.attention.q_lin.lora.B 0.186976820230484 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2723938226699829 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.18866805732250214 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.24070772528648376 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12169525027275085 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.26227468252182007 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.1335488259792328 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2600998878479004 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15008249878883362 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2391623854637146 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.11444054543972015 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.2612508535385132 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.18396827578544617 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2618298828601837 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.15817555785179138 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24128463864326477 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.10050331801176071 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24433913826942444 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11103056371212006 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.24154244363307953 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.13013333082199097 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.23339377343654633 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11259599775075912 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 40.3497 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 26.3571 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 40.8116 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 27.0836 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 39.6761 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 22.3207 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.0872 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 23.2521 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.0952 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 56.6016 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 76.5906 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 23.2036 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 40.0001 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 24.6236 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 38.1599 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 25.0324 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 37.2031 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 20.7763 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.0907 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 21.1833 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 40.4288 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 52.5808 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 79.1650 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 22.6503 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 38.5464 distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 24.4745 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 40.4875 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 26.2999 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 36.3883 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 17.6219 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 37.3501 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 21.8545 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 42.8315 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 57.1244 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 76.1303 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 21.8819 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 38.9474 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 26.5128 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 41.3343 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 27.6666 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 36.6672 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 19.5756 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 37.1686 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 20.1348 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 36.6448 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 47.2993 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 73.4788 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 19.4280 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 37.0708 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 26.4088 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 38.6041 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 26.7302 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.2148 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 17.3467 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 37.7528 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 19.2596 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 36.7303 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 42.6779 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 67.3745 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 16.6143 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 37.2654 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 26.0703 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 37.1617 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 22.8125 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.3778 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 14.4962 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.5389 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 16.0226 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 34.4220 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 38.0077 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 65.3166 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 16.5074 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.065723419189453 distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9603335857391357 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0488741397857666 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9554517269134521 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.924666166305542 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9596352577209473 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.8989819288253784 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9143486022949219 distilbert.transformer.layer.1.attention.q_lin.scale_out 1.9937138557434082 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9632105827331543 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9829158782958984 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.972657561302185 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9284312725067139 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9239709377288818 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9023430347442627 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9340471029281616 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0029258728027344 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9627807140350342 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.041708469390869 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9827865362167358 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9156968593597412 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.923585295677185 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.894546389579773 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9101351499557495 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0140867233276367 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9457342624664307 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0678651332855225 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9433894157409668 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.932185411453247 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9020198583602905 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9203736782073975 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.919033408164978 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.0426063537597656 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9197235107421875 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.0591018199920654 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9308593273162842 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8353819847106934 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9478187561035156 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9222724437713623 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.8530683517456055 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.9434971809387207 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.901269793510437 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.065319061279297 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9098787307739258 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.9001773595809937 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9044647216796875 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.8900084495544434 distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9580339193344116 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.1630 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 54.9237 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.7412 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8128 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.0365 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.9269 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.2719 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 53.7150 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.0822 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.0276 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.6968 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.1462 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.9471 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8715 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2138 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.2321 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.1507 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.9045 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.3555 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.5349 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.5577 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 53.8781 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.1729 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.5444 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.5551 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4495 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.1008 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.5170 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 54.0951 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.2702 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.8018 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.7043 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.4434 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7162 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.9523 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.0160 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.7578 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.3412 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.1367 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 52.1710 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 54.9056 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.3334 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.9395 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.4642 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.4265 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.3221 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.2530 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5957 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.3770124912261963 distilbert.transformer.layer.0.attention.q_lin.m_in 0.27724745869636536 distilbert.transformer.layer.0.attention.k_lin.m_out 0.37355756759643555 distilbert.transformer.layer.0.attention.k_lin.m_in 0.2839737832546234 distilbert.transformer.layer.0.attention.v_lin.m_out 0.2701805531978607 distilbert.transformer.layer.0.attention.v_lin.m_in 0.27773016691207886 distilbert.transformer.layer.0.attention.out_lin.m_out 0.24917708337306976 distilbert.transformer.layer.0.attention.out_lin.m_in 0.24767015874385834 distilbert.transformer.layer.1.attention.q_lin.m_out 0.3190837800502777 distilbert.transformer.layer.1.attention.q_lin.m_in 0.27320823073387146 distilbert.transformer.layer.1.attention.k_lin.m_out 0.3182787597179413 distilbert.transformer.layer.1.attention.k_lin.m_in 0.2656557559967041 distilbert.transformer.layer.1.attention.v_lin.m_out 0.25530683994293213 distilbert.transformer.layer.1.attention.v_lin.m_in 0.25207918882369995 distilbert.transformer.layer.1.attention.out_lin.m_out 0.24881324172019958 distilbert.transformer.layer.1.attention.out_lin.m_in 0.2508460581302643 distilbert.transformer.layer.2.attention.q_lin.m_out 0.3128194510936737 distilbert.transformer.layer.2.attention.q_lin.m_in 0.258196085691452 distilbert.transformer.layer.2.attention.k_lin.m_out 0.35070037841796875 distilbert.transformer.layer.2.attention.k_lin.m_in 0.2925625741481781 distilbert.transformer.layer.2.attention.v_lin.m_out 0.2368810623884201 distilbert.transformer.layer.2.attention.v_lin.m_in 0.24671325087547302 distilbert.transformer.layer.2.attention.out_lin.m_out 0.25679904222488403 distilbert.transformer.layer.2.attention.out_lin.m_in 0.23690304160118103 distilbert.transformer.layer.3.attention.q_lin.m_out 0.3336006999015808 distilbert.transformer.layer.3.attention.q_lin.m_in 0.26909562945365906 distilbert.transformer.layer.3.attention.k_lin.m_out 0.3726501166820526 distilbert.transformer.layer.3.attention.k_lin.m_in 0.28335466980934143 distilbert.transformer.layer.3.attention.v_lin.m_out 0.2500506043434143 distilbert.transformer.layer.3.attention.v_lin.m_in 0.23332524299621582 distilbert.transformer.layer.3.attention.out_lin.m_out 0.25282174348831177 distilbert.transformer.layer.3.attention.out_lin.m_in 0.24877238273620605 distilbert.transformer.layer.4.attention.q_lin.m_out 0.35002803802490234 distilbert.transformer.layer.4.attention.q_lin.m_in 0.24191546440124512 distilbert.transformer.layer.4.attention.k_lin.m_out 0.37940168380737305 distilbert.transformer.layer.4.attention.k_lin.m_in 0.25263217091560364 distilbert.transformer.layer.4.attention.v_lin.m_out 0.22971975803375244 distilbert.transformer.layer.4.attention.v_lin.m_in 0.24351906776428223 distilbert.transformer.layer.4.attention.out_lin.m_out 0.27787744998931885 distilbert.transformer.layer.4.attention.out_lin.m_in 0.2244793176651001 distilbert.transformer.layer.5.attention.q_lin.m_out 0.31838345527648926 distilbert.transformer.layer.5.attention.q_lin.m_in 0.24490538239479065 distilbert.transformer.layer.5.attention.k_lin.m_out 0.3695001006126404 distilbert.transformer.layer.5.attention.k_lin.m_in 0.24230703711509705 distilbert.transformer.layer.5.attention.v_lin.m_out 0.25600332021713257 distilbert.transformer.layer.5.attention.v_lin.m_in 0.22535942494869232 distilbert.transformer.layer.5.attention.out_lin.m_out 0.2510755956172943 distilbert.transformer.layer.5.attention.out_lin.m_in 0.22795787453651428 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 13.0632 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 10.2798 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 13.1138 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.3424 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 9.8511 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.1935 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.2815 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.7289 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 11.4923 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.2457 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 11.2975 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.5561 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 9.1756 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.6442 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.8119 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.8755 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.0380 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.4882 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 12.1821 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 10.5475 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.7905 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.6471 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.2760 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.2415 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.6200 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.7229 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 12.7838 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.3737 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 9.0698 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.1958 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.0340 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.6839 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.5137 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.0286 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.1788 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.3409 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.5298 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.0859 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.1835 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.2817 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.6623 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.4598 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.5437 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.0907 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.3540 distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.0294 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 9.3183 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.3993
In [8]:
def set_all_lora_dropout(model, new_dropout_rate):
for module in model.modules():
if isinstance(module, LoRALayer):
module.dropout.p = new_dropout_rate
def print_dropout_rates(model):
for name, module in model.named_modules():
if isinstance(module, LoRALayer):
print(f"{name}.dropout.p = {module.dropout.p}")
def split_lora_dora_params(model):
lora_A_params = []
lora_B_params = []
m_params = []
scale_params = []
for name, param in model.named_parameters():
if not param.requires_grad:
continue
if "lora.A" in name:
lora_A_params.append(param)
elif "lora.B" in name:
lora_B_params.append(param)
elif name.endswith("m_in") or name.endswith("m_out"):
m_params.append(param)
elif "scale" in name:
scale_params.append(param)
return {
"lora_A": lora_A_params,
"lora_B": lora_B_params,
"m": m_params,
"scale": scale_params,
}
def create_custom_optimizer(model, base_lr=1e-4, lr_B_scale=10.0, lr_scale_params=0.2, weight_decay=0.01):
param_groups = split_lora_dora_params(model)
optimizer = torch.optim.AdamW([
{"params": param_groups["lora_A"], "lr": base_lr},
{"params": param_groups["lora_B"], "lr": base_lr * lr_B_scale},
{"params": param_groups["m"], "lr": base_lr},
{"params": param_groups["scale"], "lr": base_lr * lr_scale_params},
], weight_decay=weight_decay)
return optimizer
In [9]:
# set dropout to 0.1 to avoid overheating lora.B channel
# but: avoid dropout = 0.0 when lora.B is already large enough: this leads to LoRA adaptors dying out
#likely due to unregularised overfitting and gradient collapse on low-magnitude params
set_all_lora_dropout(model_ddora_all_attn, 0.1)
print_dropout_rates(model_ddora_all_attn)
dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 3e-3 ###############
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
output_dir=f"{output_dir_prefix}lora-all-attn",
num_train_epochs=2,
#max_steps=200,
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=weight_decay,
evaluation_strategy="steps",
eval_steps=eval_steps,
logging_steps=logging_steps,
save_steps=eval_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
disable_tqdm=False,
push_to_hub=False,
max_grad_norm=1.0,
report_to="none",
log_level="error"
)
trainer_ddora_all_attn = Trainer(
model=model_ddora_all_attn,
args=training_args_ddora_all_attn,
train_dataset=dataset_encoded["train"],
eval_dataset=dataset_encoded["validation"],
compute_metrics=compute_metrics,
)
trainer_ddora_all_attn.optimizer = create_custom_optimizer(
trainer_ddora_all_attn.model,
base_lr=3e-3, ###########
lr_B_scale=0.5, #############
lr_scale_params=0.75, #########
weight_decay=1e-5,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
grouped = defaultdict(list)
for name, val in vals:
grouped[name].append(val)
agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}
# Example output
for name in agg["A_abs_mean"]:
print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[1564/1564 2:55:10, Epoch 2/2]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.171000 | 0.198666 | 0.928800 | 0.928650 |
100 | 0.152800 | 0.207403 | 0.928000 | 0.927965 |
150 | 0.170100 | 0.206528 | 0.922400 | 0.922406 |
200 | 0.111000 | 0.218847 | 0.927200 | 0.926929 |
250 | 0.139300 | 0.210196 | 0.928000 | 0.927940 |
300 | 0.142700 | 0.200031 | 0.929600 | 0.929578 |
350 | 0.144000 | 0.204206 | 0.924800 | 0.924776 |
400 | 0.117900 | 0.217714 | 0.930400 | 0.930310 |
450 | 0.125400 | 0.224378 | 0.924800 | 0.924822 |
500 | 0.148100 | 0.212716 | 0.926400 | 0.926400 |
550 | 0.125000 | 0.210286 | 0.928800 | 0.928650 |
600 | 0.129100 | 0.214049 | 0.929600 | 0.929589 |
650 | 0.102500 | 0.222723 | 0.927200 | 0.927171 |
700 | 0.102100 | 0.226605 | 0.928000 | 0.927927 |
750 | 0.111200 | 0.229735 | 0.925600 | 0.925606 |
800 | 0.124000 | 0.234266 | 0.919200 | 0.919298 |
850 | 0.098500 | 0.229973 | 0.926400 | 0.926400 |
900 | 0.085800 | 0.230217 | 0.924800 | 0.924650 |
950 | 0.090300 | 0.235953 | 0.928000 | 0.927927 |
1000 | 0.081900 | 0.242805 | 0.920800 | 0.920794 |
1050 | 0.096500 | 0.238420 | 0.926400 | 0.926237 |
1100 | 0.101600 | 0.233878 | 0.927200 | 0.927133 |
1150 | 0.066800 | 0.236318 | 0.929600 | 0.929529 |
1200 | 0.091300 | 0.236734 | 0.925600 | 0.925594 |
1250 | 0.077800 | 0.235389 | 0.928000 | 0.927953 |
1300 | 0.083900 | 0.239438 | 0.928800 | 0.928760 |
1350 | 0.089700 | 0.237017 | 0.927200 | 0.927092 |
1400 | 0.098800 | 0.233933 | 0.930400 | 0.930336 |
1450 | 0.072900 | 0.235942 | 0.927200 | 0.927194 |
1500 | 0.096200 | 0.238092 | 0.929600 | 0.929578 |
1550 | 0.171300 | 0.236920 | 0.928000 | 0.927965 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 559443 KiB | 12777 MiB | 327719 GiB | 327718 GiB | | from large pool | 541440 KiB | 12712 MiB | 326500 GiB | 326500 GiB | | from small pool | 18003 KiB | 67 MiB | 1218 GiB | 1218 GiB | |---------------------------------------------------------------------------| | Active memory | 559443 KiB | 12777 MiB | 327719 GiB | 327718 GiB | | from large pool | 541440 KiB | 12712 MiB | 326500 GiB | 326500 GiB | | from small pool | 18003 KiB | 67 MiB | 1218 GiB | 1218 GiB | |---------------------------------------------------------------------------| | Requested memory | 557230 KiB | 12774 MiB | 327414 GiB | 327413 GiB | | from large pool | 539228 KiB | 12708 MiB | 326202 GiB | 326201 GiB | | from small pool | 18002 KiB | 67 MiB | 1212 GiB | 1212 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 13058 MiB | 13058 MiB | 25404 MiB | 12346 MiB | | from large pool | 12988 MiB | 12988 MiB | 25272 MiB | 12284 MiB | | from small pool | 70 MiB | 70 MiB | 132 MiB | 62 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 186029 KiB | 403075 KiB | 54058 GiB | 54058 GiB | | from large pool | 179456 KiB | 396800 KiB | 52747 GiB | 52747 GiB | | from small pool | 6573 KiB | 43066 KiB | 1311 GiB | 1311 GiB | |---------------------------------------------------------------------------| | Allocations | 860 | 1343 | 30218 K | 30217 K | | from large pool | 80 | 334 | 8442 K | 8442 K | | from small pool | 780 | 1235 | 21775 K | 21775 K | |---------------------------------------------------------------------------| | Active allocs | 860 | 1343 | 30218 K | 30217 K | | from large pool | 80 | 334 | 8442 K | 8442 K | | from small pool | 780 | 1235 | 21775 K | 21775 K | |---------------------------------------------------------------------------| | GPU reserved segments | 263 | 266 | 503 | 240 | | from large pool | 228 | 231 | 437 | 209 | | from small pool | 35 | 35 | 66 | 31 | |---------------------------------------------------------------------------| | Non-releasable allocs | 39 | 91 | 13123 K | 13123 K | | from large pool | 19 | 25 | 1645 K | 1645 K | | from small pool | 20 | 73 | 11477 K | 11477 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2923, |B|=0.1886, |∇A|=1.603e-05, |∇B|=1.49e-05, |LoRA(x)|=3.698e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.295, |B|=0.1946, |∇A|=8.035e-06, |∇B|=1.447e-05, |LoRA(x)|=4.196e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2844, |B|=0.1591, |∇A|=1.076e-05, |∇B|=1.945e-05, |LoRA(x)|=4.082e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.271, |B|=0.166, |∇A|=1.771e-05, |∇B|=3.743e-05, |LoRA(x)|=2.424e+04, B≠0=12288 distilbert.transformer.layer.0.ffn.lin1: |A|=0.3087, |B|=0.2015, |∇A|=2.777e-05, |∇B|=1.693e-05, |LoRA(x)|=9.04e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2703, |B|=0.1648, |∇A|=8.989e-06, |∇B|=4.098e-05, |LoRA(x)|=4.44e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2863, |B|=0.1758, |∇A|=1.646e-05, |∇B|=1.71e-05, |LoRA(x)|=2.87e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.2757, |B|=0.1802, |∇A|=9.736e-06, |∇B|=1.758e-05, |LoRA(x)|=3.28e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2666, |B|=0.1485, |∇A|=1.34e-05, |∇B|=2.776e-05, |LoRA(x)|=4.203e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2704, |B|=0.1508, |∇A|=1.454e-05, |∇B|=3.814e-05, |LoRA(x)|=3.212e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.2926, |B|=0.1876, |∇A|=2.232e-05, |∇B|=2.009e-05, |LoRA(x)|=9.228e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.2796, |B|=0.1607, |∇A|=1.426e-05, |∇B|=6.012e-05, |LoRA(x)|=2.112e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2781, |B|=0.1749, |∇A|=1.316e-05, |∇B|=2.128e-05, |LoRA(x)|=3.467e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2924, |B|=0.1885, |∇A|=1.639e-05, |∇B|=2.285e-05, |LoRA(x)|=3.962e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2582, |B|=0.1257, |∇A|=7.738e-06, |∇B|=2.115e-05, |LoRA(x)|=5.115e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2642, |B|=0.1559, |∇A|=2.69e-05, |∇B|=4.541e-05, |LoRA(x)|=1.683e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.3067, |B|=0.2024, |∇A|=2.343e-05, |∇B|=2.157e-05, |LoRA(x)|=1.145e+05, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2683, |B|=0.1544, |∇A|=1.34e-05, |∇B|=5.394e-05, |LoRA(x)|=3.488e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.2803, |B|=0.1901, |∇A|=1.35e-05, |∇B|=2.353e-05, |LoRA(x)|=3.44e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.2952, |B|=0.1982, |∇A|=2.459e-05, |∇B|=2.6e-05, |LoRA(x)|=3.69e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2612, |B|=0.1387, |∇A|=1.571e-05, |∇B|=2.234e-05, |LoRA(x)|=3.344e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2658, |B|=0.1439, |∇A|=3.732e-05, |∇B|=3.537e-05, |LoRA(x)|=1.835e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2639, |B|=0.1687, |∇A|=2.518e-05, |∇B|=1.91e-05, |LoRA(x)|=6.809e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2576, |B|=0.139, |∇A|=1.165e-05, |∇B|=4.748e-05, |LoRA(x)|=3.457e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.266, |B|=0.1887, |∇A|=1.028e-05, |∇B|=2.133e-05, |LoRA(x)|=3.756e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2772, |B|=0.1906, |∇A|=2.782e-05, |∇B|=2.17e-05, |LoRA(x)|=3.853e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2456, |B|=0.1248, |∇A|=7.548e-06, |∇B|=1.231e-05, |LoRA(x)|=3.302e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.266, |B|=0.1366, |∇A|=3.234e-05, |∇B|=2.639e-05, |LoRA(x)|=2.125e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.264, |B|=0.1525, |∇A|=1.45e-05, |∇B|=8.94e-06, |LoRA(x)|=8.223e+04, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2397, |B|=0.1167, |∇A|=1.681e-06, |∇B|=1.305e-05, |LoRA(x)|=9.592e+04, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.2693, |B|=0.186, |∇A|=6.757e-06, |∇B|=1.821e-05, |LoRA(x)|=6e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.2672, |B|=0.1606, |∇A|=2.107e-05, |∇B|=1.107e-05, |LoRA(x)|=4.379e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.2468, |B|=0.1033, |∇A|=7.268e-06, |∇B|=8.619e-06, |LoRA(x)|=3.511e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.2494, |B|=0.1147, |∇A|=1.182e-05, |∇B|=1.262e-05, |LoRA(x)|=2.164e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.2487, |B|=0.1329, |∇A|=7.639e-06, |∇B|=3.873e-06, |LoRA(x)|=6.733e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2379, |B|=0.1144, |∇A|=2.809e-07, |∇B|=5.887e-06, |LoRA(x)|=2.086e+05, B≠0=12288
In [10]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.28593704104423523 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.18714945018291473 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.28985413908958435 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.1930702030658722 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.27944302558898926 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.15703189373016357 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.26731884479522705 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.16420790553092957 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3031120300292969 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.20004507899284363 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.26663610339164734 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16295364499092102 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2813974618911743 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17370526492595673 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2702440321445465 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17850112915039062 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2619485855102539 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.1464267075061798 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2668524980545044 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14889678359031677 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2867199778556824 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.18584245443344116 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.2757873833179474 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15874330699443817 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.27296027541160583 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.1735691875219345 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.28685471415519714 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1868443489074707 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.255046546459198 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12340390682220459 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26136600971221924 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15360134840011597 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.302565336227417 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.2009706199169159 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2644515633583069 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.15223130583763123 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.27478688955307007 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.18895143270492554 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2912011742591858 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.19694802165031433 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2578084468841553 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13630230724811554 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2611168622970581 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.14164264500141144 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.2590485215187073 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16692781448364258 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.254573792219162 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13625115156173706 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.26176127791404724 distilbert.transformer.layer.4.attention.q_lin.lora.B 0.187218576669693 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2729148268699646 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.1890524923801422 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.24128566682338715 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12205924838781357 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.26241645216941833 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.13388215005397797 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.26028382778167725 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15029317140579224 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23894798755645752 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.1147853285074234 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.26214921474456787 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.18426935374736786 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2623964548110962 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.15851348638534546 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.2414218783378601 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.10067234933376312 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24519243836402893 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11163420975208282 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.24182814359664917 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1305427998304367 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.23493000864982605 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11285355687141418 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 40.4405 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 26.3895 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 40.9122 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 27.1172 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 39.7739 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 22.3608 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.1675 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 23.2920 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.1837 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 56.6638 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 76.7346 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 23.2397 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 40.0452 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 24.6523 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 38.2249 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 25.0732 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 37.2578 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 20.8114 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.1229 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 21.2260 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 40.4913 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 52.6494 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 79.2518 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 22.6957 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 38.6635 distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 24.5085 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 40.5898 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 26.3329 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 36.4050 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 17.6652 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 37.3746 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 21.8932 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 42.8806 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 57.1776 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 76.1942 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 21.9220 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 39.0540 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 26.5379 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 41.3258 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 27.7049 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 36.7462 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 19.6216 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 37.2570 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 20.1840 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 36.7492 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 47.3749 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 73.4350 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 19.4855 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 37.1394 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 26.4493 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 38.6975 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 26.7736 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.3077 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 17.4064 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 37.7813 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 19.2943 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 36.7713 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 42.7477 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 67.3355 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 16.6588 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 37.3640 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 26.1069 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 37.2713 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 22.8616 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.4147 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 14.5707 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.6305 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 16.0943 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 34.4835 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 38.1080 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 65.7199 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 16.5415 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.0673470497131348 distilbert.transformer.layer.0.attention.q_lin.scale_in 1.959618330001831 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0504937171936035 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9555171728134155 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9283709526062012 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9596271514892578 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9006755352020264 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9129583835601807 distilbert.transformer.layer.1.attention.q_lin.scale_out 1.9956352710723877 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9624356031417847 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.984641432762146 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9718208312988281 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9293396472930908 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.922268271446228 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9018371105194092 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9323279857635498 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0033369064331055 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9628219604492188 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0424909591674805 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9834234714508057 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.915880560874939 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9204366207122803 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.8952863216400146 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9077247381210327 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.016188144683838 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9464267492294312 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0708353519439697 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9410731792449951 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9289363622665405 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9007830619812012 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9172041416168213 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9178813695907593 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.042696714401245 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9188411235809326 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.06121826171875 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9304380416870117 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8320510387420654 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.946484088897705 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9181432723999023 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.8494853973388672 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.9441440105438232 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.900758147239685 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.066316843032837 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9095886945724487 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.896740198135376 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9020140171051025 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.887932300567627 distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9566638469696045 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.2229 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 54.9106 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.7934 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8139 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.1541 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.9282 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.3326 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 53.6839 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.1401 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.0107 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.7442 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.1282 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.9817 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8315 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2108 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.1900 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.1699 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.9118 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.3825 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.5532 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.5693 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 53.8041 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.1984 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.4889 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.6185 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4670 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.1903 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.4578 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 54.0061 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.2434 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.7205 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.6783 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.4506 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7003 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 58.0178 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.0089 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.6836 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.3096 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.0278 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 52.0867 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 54.9359 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.3308 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.9806 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.4626 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.3404 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.2618 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.2070 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5645 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.38056397438049316 distilbert.transformer.layer.0.attention.q_lin.m_in 0.27801811695098877 distilbert.transformer.layer.0.attention.k_lin.m_out 0.37722188234329224 distilbert.transformer.layer.0.attention.k_lin.m_in 0.2856902778148651 distilbert.transformer.layer.0.attention.v_lin.m_out 0.2773345112800598 distilbert.transformer.layer.0.attention.v_lin.m_in 0.2796667814254761 distilbert.transformer.layer.0.attention.out_lin.m_out 0.2537873089313507 distilbert.transformer.layer.0.attention.out_lin.m_in 0.2489577978849411 distilbert.transformer.layer.1.attention.q_lin.m_out 0.323003351688385 distilbert.transformer.layer.1.attention.q_lin.m_in 0.2741820216178894 distilbert.transformer.layer.1.attention.k_lin.m_out 0.32190388441085815 distilbert.transformer.layer.1.attention.k_lin.m_in 0.26650986075401306 distilbert.transformer.layer.1.attention.v_lin.m_out 0.25884923338890076 distilbert.transformer.layer.1.attention.v_lin.m_in 0.25241196155548096 distilbert.transformer.layer.1.attention.out_lin.m_out 0.25029632449150085 distilbert.transformer.layer.1.attention.out_lin.m_in 0.2511628568172455 distilbert.transformer.layer.2.attention.q_lin.m_out 0.3147231936454773 distilbert.transformer.layer.2.attention.q_lin.m_in 0.26011890172958374 distilbert.transformer.layer.2.attention.k_lin.m_out 0.3528413772583008 distilbert.transformer.layer.2.attention.k_lin.m_in 0.29503440856933594 distilbert.transformer.layer.2.attention.v_lin.m_out 0.23936259746551514 distilbert.transformer.layer.2.attention.v_lin.m_in 0.2455853670835495 distilbert.transformer.layer.2.attention.out_lin.m_out 0.25981783866882324 distilbert.transformer.layer.2.attention.out_lin.m_in 0.23666052520275116 distilbert.transformer.layer.3.attention.q_lin.m_out 0.3375234305858612 distilbert.transformer.layer.3.attention.q_lin.m_in 0.2719265818595886 distilbert.transformer.layer.3.attention.k_lin.m_out 0.37764236330986023 distilbert.transformer.layer.3.attention.k_lin.m_in 0.281810462474823 distilbert.transformer.layer.3.attention.v_lin.m_out 0.24766501784324646 distilbert.transformer.layer.3.attention.v_lin.m_in 0.23446033895015717 distilbert.transformer.layer.3.attention.out_lin.m_out 0.2505345344543457 distilbert.transformer.layer.3.attention.out_lin.m_in 0.2496027946472168 distilbert.transformer.layer.4.attention.q_lin.m_out 0.3517087697982788 distilbert.transformer.layer.4.attention.q_lin.m_in 0.2426379770040512 distilbert.transformer.layer.4.attention.k_lin.m_out 0.3833864629268646 distilbert.transformer.layer.4.attention.k_lin.m_in 0.2542974054813385 distilbert.transformer.layer.4.attention.v_lin.m_out 0.22918450832366943 distilbert.transformer.layer.4.attention.v_lin.m_in 0.24464242160320282 distilbert.transformer.layer.4.attention.out_lin.m_out 0.2750687599182129 distilbert.transformer.layer.4.attention.out_lin.m_in 0.2238387167453766 distilbert.transformer.layer.5.attention.q_lin.m_out 0.32097992300987244 distilbert.transformer.layer.5.attention.q_lin.m_in 0.2463197410106659 distilbert.transformer.layer.5.attention.k_lin.m_out 0.37192124128341675 distilbert.transformer.layer.5.attention.k_lin.m_in 0.244779571890831 distilbert.transformer.layer.5.attention.v_lin.m_out 0.2550698518753052 distilbert.transformer.layer.5.attention.v_lin.m_in 0.22488240897655487 distilbert.transformer.layer.5.attention.out_lin.m_out 0.25121861696243286 distilbert.transformer.layer.5.attention.out_lin.m_in 0.2287808656692505 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 13.2019 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 10.2984 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 13.2189 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.3540 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 10.0525 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.2157 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.4275 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.7476 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 11.5930 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.2563 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 11.3709 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.5824 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 9.2795 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.6517 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.8824 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.8791 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.1133 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.5387 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 12.2395 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 10.5863 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.8512 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.6610 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.3322 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.2583 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.7188 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.7568 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 12.9247 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.3524 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.9878 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.2346 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 8.9857 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.6994 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.5542 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.0776 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.2873 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.3746 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.5224 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.1099 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.0823 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.2891 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.7517 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.5089 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.6481 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.1404 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.3037 distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.0170 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 9.3243 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.4163
Freeze LoRA parameters for layer 5 FFN¶
In [11]:
# Freeze LoRA parameters for layer.5.ffn.lin2
for name, param in trainer_ddora_all_attn.model.named_parameters():#
if "transformer.layer.5.ffn.lin2" in name and "lora" in name:
param.requires_grad = False
print(f"FROZEN: {name}")
dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 1e-4 ###############
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
output_dir=f"{output_dir_prefix}lora-all-attn",
num_train_epochs=1,
#max_steps=200,
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=weight_decay,
evaluation_strategy="steps",
eval_steps=eval_steps,
logging_steps=logging_steps,
save_steps=eval_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
disable_tqdm=False,
push_to_hub=False,
max_grad_norm=1.0,
report_to="none",
log_level="error"
)
trainer_ddora_all_attn = Trainer(
model=model_ddora_all_attn,
args=training_args_ddora_all_attn,
train_dataset=dataset_encoded["train"],
eval_dataset=dataset_encoded["validation"],
compute_metrics=compute_metrics,
)
trainer_ddora_all_attn.optimizer = create_custom_optimizer(
trainer_ddora_all_attn.model,
base_lr=1e-3, ###########
lr_B_scale=1.0, #############
lr_scale_params=1.0, #########
weight_decay=1e-5,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
grouped = defaultdict(list)
for name, val in vals:
grouped[name].append(val)
agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}
for name in agg["A_abs_mean"]:
print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
f"|∇A|={agg['A_grad_mean'].get(name, 0.0):.4g}, |∇B|={agg['B_grad_mean'].get(name, 0.0):.4g}, "
f"|LoRA(x)|={agg['lora_output_norm'].get(name, 0.0):.4g}, B≠0={agg['B_nonzero_count'].get(name, 0):.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
FROZEN: distilbert.transformer.layer.5.ffn.lin2.lora.A FROZEN: distilbert.transformer.layer.5.ffn.lin2.lora.B
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[782/782 1:20:33, Epoch 1/1]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.117500 | 0.208835 | 0.930400 | 0.930336 |
100 | 0.150800 | 0.206763 | 0.928000 | 0.927940 |
150 | 0.162700 | 0.207248 | 0.924000 | 0.924006 |
200 | 0.109600 | 0.207848 | 0.930400 | 0.930310 |
250 | 0.136800 | 0.213334 | 0.927200 | 0.927227 |
300 | 0.141400 | 0.206167 | 0.928800 | 0.928735 |
350 | 0.143200 | 0.206361 | 0.927200 | 0.927194 |
400 | 0.114100 | 0.212035 | 0.925600 | 0.925656 |
450 | 0.127800 | 0.209920 | 0.928800 | 0.928772 |
500 | 0.142900 | 0.205849 | 0.930400 | 0.930323 |
550 | 0.126100 | 0.205841 | 0.932000 | 0.931912 |
600 | 0.128800 | 0.207515 | 0.928000 | 0.927965 |
650 | 0.100400 | 0.207170 | 0.930400 | 0.930310 |
700 | 0.102100 | 0.209562 | 0.930400 | 0.930336 |
750 | 0.107500 | 0.210749 | 0.930400 | 0.930336 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 558963 KiB | 12777 MiB | 392916 GiB | 392915 GiB | | from large pool | 541440 KiB | 12712 MiB | 391456 GiB | 391455 GiB | | from small pool | 17523 KiB | 67 MiB | 1459 GiB | 1459 GiB | |---------------------------------------------------------------------------| | Active memory | 558963 KiB | 12777 MiB | 392916 GiB | 392915 GiB | | from large pool | 541440 KiB | 12712 MiB | 391456 GiB | 391455 GiB | | from small pool | 17523 KiB | 67 MiB | 1459 GiB | 1459 GiB | |---------------------------------------------------------------------------| | Requested memory | 556750 KiB | 12774 MiB | 392587 GiB | 392586 GiB | | from large pool | 539228 KiB | 12708 MiB | 391134 GiB | 391134 GiB | | from small pool | 17522 KiB | 67 MiB | 1452 GiB | 1452 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 12940 MiB | 13058 MiB | 37632 MiB | 24692 MiB | | from large pool | 12872 MiB | 12988 MiB | 37440 MiB | 24568 MiB | | from small pool | 68 MiB | 70 MiB | 192 MiB | 124 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 186509 KiB | 403075 KiB | 69510 GiB | 69510 GiB | | from large pool | 179456 KiB | 396800 KiB | 67937 GiB | 67937 GiB | | from small pool | 7053 KiB | 43066 KiB | 1572 GiB | 1572 GiB | |---------------------------------------------------------------------------| | Allocations | 856 | 1343 | 36234 K | 36233 K | | from large pool | 80 | 334 | 10124 K | 10124 K | | from small pool | 776 | 1235 | 26109 K | 26109 K | |---------------------------------------------------------------------------| | Active allocs | 856 | 1343 | 36234 K | 36233 K | | from large pool | 80 | 334 | 10124 K | 10124 K | | from small pool | 776 | 1235 | 26109 K | 26109 K | |---------------------------------------------------------------------------| | GPU reserved segments | 259 | 266 | 736 | 477 | | from large pool | 225 | 231 | 640 | 415 | | from small pool | 34 | 35 | 96 | 62 | |---------------------------------------------------------------------------| | Non-releasable allocs | 37 | 91 | 15948 K | 15948 K | | from large pool | 19 | 25 | 2009 K | 2009 K | | from small pool | 18 | 73 | 13938 K | 13938 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2869, |B|=0.1876, |∇A|=1.333e-05, |∇B|=1.196e-05, |LoRA(x)|=3.76e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.2906, |B|=0.1935, |∇A|=6.433e-06, |∇B|=1.195e-05, |LoRA(x)|=4.247e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2801, |B|=0.1574, |∇A|=8.705e-06, |∇B|=1.581e-05, |LoRA(x)|=4.329e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.268, |B|=0.1646, |∇A|=1.558e-05, |∇B|=3.117e-05, |LoRA(x)|=2.49e+04, B≠0=12288 distilbert.transformer.layer.0.ffn.lin1: |A|=0.304, |B|=0.2005, |∇A|=2.365e-05, |∇B|=1.405e-05, |LoRA(x)|=9.32e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2672, |B|=0.1635, |∇A|=8.24e-06, |∇B|=3.471e-05, |LoRA(x)|=4.23e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2818, |B|=0.1742, |∇A|=1.286e-05, |∇B|=1.379e-05, |LoRA(x)|=3.027e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.271, |B|=0.1789, |∇A|=7.098e-06, |∇B|=1.336e-05, |LoRA(x)|=3.711e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2626, |B|=0.1468, |∇A|=1.056e-05, |∇B|=2.244e-05, |LoRA(x)|=4.453e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.267, |B|=0.1492, |∇A|=1.1e-05, |∇B|=2.913e-05, |LoRA(x)|=3.652e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.2876, |B|=0.1862, |∇A|=1.606e-05, |∇B|=1.513e-05, |LoRA(x)|=1.066e+05, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.2764, |B|=0.1592, |∇A|=1.147e-05, |∇B|=4.788e-05, |LoRA(x)|=2.209e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2737, |B|=0.1739, |∇A|=1.058e-05, |∇B|=1.74e-05, |LoRA(x)|=3.618e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2877, |B|=0.1872, |∇A|=1.217e-05, |∇B|=1.751e-05, |LoRA(x)|=4.144e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2554, |B|=0.1239, |∇A|=6.755e-06, |∇B|=1.772e-05, |LoRA(x)|=5.225e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2617, |B|=0.1543, |∇A|=2.443e-05, |∇B|=3.976e-05, |LoRA(x)|=1.704e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.3031, |B|=0.2014, |∇A|=1.933e-05, |∇B|=1.793e-05, |LoRA(x)|=1.211e+05, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2651, |B|=0.1528, |∇A|=1.109e-05, |∇B|=4.721e-05, |LoRA(x)|=3.653e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.2756, |B|=0.1892, |∇A|=1.127e-05, |∇B|=1.954e-05, |LoRA(x)|=3.524e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.2917, |B|=0.1971, |∇A|=2.061e-05, |∇B|=2.185e-05, |LoRA(x)|=3.752e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2585, |B|=0.1368, |∇A|=1.276e-05, |∇B|=1.827e-05, |LoRA(x)|=3.405e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2618, |B|=0.1422, |∇A|=3.202e-05, |∇B|=2.984e-05, |LoRA(x)|=1.839e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2599, |B|=0.1674, |∇A|=2.259e-05, |∇B|=1.615e-05, |LoRA(x)|=7.004e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2549, |B|=0.137, |∇A|=9.369e-06, |∇B|=4.113e-05, |LoRA(x)|=3.733e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.2624, |B|=0.1878, |∇A|=9.001e-06, |∇B|=1.755e-05, |LoRA(x)|=4.085e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2735, |B|=0.1897, |∇A|=2.599e-05, |∇B|=1.944e-05, |LoRA(x)|=3.824e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2421, |B|=0.1227, |∇A|=8.159e-06, |∇B|=1.143e-05, |LoRA(x)|=3.423e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2631, |B|=0.1349, |∇A|=3.188e-05, |∇B|=2.495e-05, |LoRA(x)|=2.062e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2609, |B|=0.1508, |∇A|=1.334e-05, |∇B|=7.769e-06, |LoRA(x)|=8.132e+04, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2387, |B|=0.1151, |∇A|=1.079e-06, |∇B|=1.035e-05, |LoRA(x)|=1.112e+05, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.2637, |B|=0.1848, |∇A|=5.53e-06, |∇B|=1.5e-05, |LoRA(x)|=6.21e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.2632, |B|=0.1592, |∇A|=1.749e-05, |∇B|=8.815e-06, |LoRA(x)|=4.364e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.242, |B|=0.1006, |∇A|=5.803e-06, |∇B|=6.981e-06, |LoRA(x)|=3.932e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.2455, |B|=0.1123, |∇A|=1.09e-05, |∇B|=1.266e-05, |LoRA(x)|=2.144e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.2427, |B|=0.131, |∇A|=5.69e-06, |∇B|=2.801e-06, |LoRA(x)|=6.67e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2349, |B|=0.1129, |∇A|=0, |∇B|=0, |LoRA(x)|=2.068e+05, B≠0=12288
In [12]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.28707337379455566 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.18769995868206024 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2907640337944031 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.1936057060956955 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.28023409843444824 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.1574789583683014 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.26817581057548523 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.16465796530246735 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.30428409576416016 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.20058000087738037 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2674410343170166 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16359667479991913 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2819472551345825 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17439055442810059 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2710683047771454 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17898575961589813 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2627217173576355 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.1469723880290985 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2670312821865082 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14931659400463104 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2877423167228699 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.18635958433151245 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.27652254700660706 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15932457149028778 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.273947536945343 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.1739175021648407 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.28790760040283203 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.18734395503997803 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.25540855526924133 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12403412163257599 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26176193356513977 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15451809763908386 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.3031601309776306 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.20152036845684052 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2652890384197235 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.15302824974060059 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2757566273212433 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.18924658000469208 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2918875217437744 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.19718694686889648 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.25860539078712463 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13704372942447662 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.26196131110191345 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.14226996898651123 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.25997695326805115 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16748486459255219 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.25504302978515625 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13720043003559113 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.26243939995765686 distilbert.transformer.layer.4.attention.q_lin.lora.B 0.1878441572189331 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.273539662361145 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.189828559756279 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.2422509640455246 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12290792167186737 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.26336467266082764 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.13514696061611176 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.26100867986679077 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15097534656524658 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23862293362617493 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.11521776020526886 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.2639756500720978 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.18488191068172455 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2634556293487549 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.15944841504096985 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.2421378344297409 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.10063609480857849 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.2455993890762329 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11262671649456024 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.24293923377990723 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.13117605447769165 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.23493000864982605 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11285355687141418 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 40.5814 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 26.4466 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 41.0152 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 27.1966 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 39.8526 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 22.4586 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.2699 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 23.3672 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.3197 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 56.7974 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 76.9530 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 23.3197 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 40.1291 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 24.7248 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 38.3195 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 25.1494 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 37.3453 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 20.9148 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.1472 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 21.3194 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 40.6311 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 52.8018 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 79.4618 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 22.7955 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 38.7868 distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 24.5725 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 40.7129 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 26.4076 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 36.4647 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 17.7767 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 37.4144 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 21.9998 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 42.9848 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 57.3165 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 76.4062 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 22.0261 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 39.1650 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 26.6011 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 41.4144 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 27.7537 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 36.8473 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 19.7296 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 37.3618 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 20.2695 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 36.8514 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 47.5482 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 73.5448 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 19.6095 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 37.2356 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 26.5284 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 38.7939 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 26.8519 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.4178 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 17.5407 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 37.8966 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 19.4117 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 36.8628 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 42.9398 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 67.2699 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 16.7667 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 37.5402 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 26.1881 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 37.4260 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 22.9560 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.5026 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 14.7060 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.6724 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 16.2978 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 34.6172 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 38.2762 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 65.7199 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 16.5415 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.076575756072998 distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9624719619750977 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.059420585632324 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9576948881149292 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.935288429260254 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9621213674545288 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9077293872833252 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9148902893066406 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.003368377685547 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9634246826171875 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9907522201538086 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9740617275238037 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9336799383163452 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9234355688095093 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9045276641845703 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9323878288269043 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.010300636291504 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9660112857818604 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.053152322769165 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.986611247062683 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9197368621826172 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9203671216964722 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.9001054763793945 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9074167013168335 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.021730422973633 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.949169397354126 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.077785015106201 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9428112506866455 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.933730125427246 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9019935131072998 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9261023998260498 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9190481901168823 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.045186758041382 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9197725057601929 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.065519094467163 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9318139553070068 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8299648761749268 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9480531215667725 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.917306900024414 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.84946870803833 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.948282241821289 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9058736562728882 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.071871519088745 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9120490550994873 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.896397352218628 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9039411544799805 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.8885447978973389 distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9554933309555054 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.4880 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 54.9874 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 58.0535 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8672 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.3577 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.9888 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.5378 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 53.7379 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.3696 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.0367 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.9233 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.1861 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 54.1079 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8645 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2956 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.1854 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.3724 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.9977 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.6940 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.6386 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.6841 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 53.8057 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.3392 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.4810 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.7827 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.5376 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.3907 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.5073 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 54.1504 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.2825 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.9838 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.7123 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.5195 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7273 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 58.1384 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.0466 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.6293 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.3511 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 53.9963 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 52.0921 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.0667 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.4637 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 58.1353 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.5325 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.3339 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.3115 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.2332 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5357 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.39016568660736084 distilbert.transformer.layer.0.attention.q_lin.m_in 0.2815612256526947 distilbert.transformer.layer.0.attention.k_lin.m_out 0.3865947425365448 distilbert.transformer.layer.0.attention.k_lin.m_in 0.28855225443840027 distilbert.transformer.layer.0.attention.v_lin.m_out 0.28526735305786133 distilbert.transformer.layer.0.attention.v_lin.m_in 0.28309836983680725 distilbert.transformer.layer.0.attention.out_lin.m_out 0.26175251603126526 distilbert.transformer.layer.0.attention.out_lin.m_in 0.2526334524154663 distilbert.transformer.layer.1.attention.q_lin.m_out 0.33136361837387085 distilbert.transformer.layer.1.attention.q_lin.m_in 0.2759714126586914 distilbert.transformer.layer.1.attention.k_lin.m_out 0.32868993282318115 distilbert.transformer.layer.1.attention.k_lin.m_in 0.26953765749931335 distilbert.transformer.layer.1.attention.v_lin.m_out 0.26419997215270996 distilbert.transformer.layer.1.attention.v_lin.m_in 0.2546566128730774 distilbert.transformer.layer.1.attention.out_lin.m_out 0.2542833983898163 distilbert.transformer.layer.1.attention.out_lin.m_in 0.25218233466148376 distilbert.transformer.layer.2.attention.q_lin.m_out 0.3223288655281067 distilbert.transformer.layer.2.attention.q_lin.m_in 0.2641069293022156 distilbert.transformer.layer.2.attention.k_lin.m_out 0.363922119140625 distilbert.transformer.layer.2.attention.k_lin.m_in 0.2987942099571228 distilbert.transformer.layer.2.attention.v_lin.m_out 0.24425539374351501 distilbert.transformer.layer.2.attention.v_lin.m_in 0.24729464948177338 distilbert.transformer.layer.2.attention.out_lin.m_out 0.2655073404312134 distilbert.transformer.layer.2.attention.out_lin.m_in 0.23778128623962402 distilbert.transformer.layer.3.attention.q_lin.m_out 0.343430757522583 distilbert.transformer.layer.3.attention.q_lin.m_in 0.2754462957382202 distilbert.transformer.layer.3.attention.k_lin.m_out 0.3848956525325775 distilbert.transformer.layer.3.attention.k_lin.m_in 0.28433045744895935 distilbert.transformer.layer.3.attention.v_lin.m_out 0.2534070909023285 distilbert.transformer.layer.3.attention.v_lin.m_in 0.23691999912261963 distilbert.transformer.layer.3.attention.out_lin.m_out 0.26040372252464294 distilbert.transformer.layer.3.attention.out_lin.m_in 0.2518913149833679 distilbert.transformer.layer.4.attention.q_lin.m_out 0.3547815680503845 distilbert.transformer.layer.4.attention.q_lin.m_in 0.24450191855430603 distilbert.transformer.layer.4.attention.k_lin.m_out 0.388192355632782 distilbert.transformer.layer.4.attention.k_lin.m_in 0.2566456198692322 distilbert.transformer.layer.4.attention.v_lin.m_out 0.22937855124473572 distilbert.transformer.layer.4.attention.v_lin.m_in 0.24749258160591125 distilbert.transformer.layer.4.attention.out_lin.m_out 0.2755943536758423 distilbert.transformer.layer.4.attention.out_lin.m_in 0.2263449877500534 distilbert.transformer.layer.5.attention.q_lin.m_out 0.32583218812942505 distilbert.transformer.layer.5.attention.q_lin.m_in 0.2522851228713989 distilbert.transformer.layer.5.attention.k_lin.m_out 0.378021240234375 distilbert.transformer.layer.5.attention.k_lin.m_in 0.2485998570919037 distilbert.transformer.layer.5.attention.v_lin.m_out 0.2562413215637207 distilbert.transformer.layer.5.attention.v_lin.m_in 0.22845476865768433 distilbert.transformer.layer.5.attention.out_lin.m_out 0.2536100745201111 distilbert.transformer.layer.5.attention.out_lin.m_in 0.2289573848247528 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 13.4592 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 10.3601 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 13.4781 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.3834 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 10.2648 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.2542 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.6388 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.7940 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 11.8276 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.2812 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 11.5565 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.6223 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 9.4099 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.6938 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.9958 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.8709 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.3210 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.6061 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 12.5516 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 10.6418 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.9932 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.6817 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.4676 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.2764 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.8944 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.8136 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 13.1322 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.4065 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 9.1533 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.2861 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.2659 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.7452 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.6204 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.1060 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.4002 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.4113 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.4958 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.1393 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.0271 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.3287 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.9104 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.5670 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.7845 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.2074 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.3198 distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.0539 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 9.3842 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.4135
Un-Freeze LoRA parameters for layer 5 FFN¶
In [13]:
# Un-freeze LoRA parameters for layer.5.ffn.lin2
for name, param in trainer_ddora_all_attn.model.named_parameters():
if "transformer.layer.5.ffn.lin2" in name and "lora" in name:
param.requires_grad = True
print(f"UNFROZEN: {name}")
dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 1e-4 ###############
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
output_dir=f"{output_dir_prefix}lora-all-attn",
num_train_epochs=1,
#max_steps=200,
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=weight_decay,
evaluation_strategy="steps",
eval_steps=eval_steps,
logging_steps=logging_steps,
save_steps=eval_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
disable_tqdm=False,
push_to_hub=False,
max_grad_norm=1.0,
report_to="none",
log_level="error"
)
trainer_ddora_all_attn = Trainer(
model=model_ddora_all_attn,
args=training_args_ddora_all_attn,
train_dataset=dataset_encoded["train"],
eval_dataset=dataset_encoded["validation"],
compute_metrics=compute_metrics,
)
trainer_ddora_all_attn.optimizer = create_custom_optimizer(
trainer_ddora_all_attn.model,
base_lr=5e-4, ###########
lr_B_scale=0.5, #############
lr_scale_params=1.0, #########
weight_decay=1e-5,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
grouped = defaultdict(list)
for name, val in vals:
grouped[name].append(val)
agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}
for name in agg["A_abs_mean"]:
print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
f"|∇A|={agg['A_grad_mean'].get(name, 0.0):.4g}, |∇B|={agg['B_grad_mean'].get(name, 0.0):.4g}, "
f"|LoRA(x)|={agg['lora_output_norm'].get(name, 0.0):.4g}, B≠0={agg['B_nonzero_count'].get(name, 0):.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
UNFROZEN: distilbert.transformer.layer.5.ffn.lin2.lora.A UNFROZEN: distilbert.transformer.layer.5.ffn.lin2.lora.B
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[782/782 1:41:38, Epoch 1/1]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.090400 | 0.213297 | 0.932000 | 0.931973 |
100 | 0.115900 | 0.215432 | 0.928800 | 0.928783 |
150 | 0.126300 | 0.215974 | 0.928800 | 0.928772 |
200 | 0.082900 | 0.218342 | 0.928800 | 0.928735 |
250 | 0.111600 | 0.221422 | 0.928000 | 0.928021 |
300 | 0.119800 | 0.219625 | 0.928000 | 0.927965 |
350 | 0.123500 | 0.220197 | 0.925600 | 0.925617 |
400 | 0.092500 | 0.223016 | 0.926400 | 0.926442 |
450 | 0.112200 | 0.219613 | 0.929600 | 0.929554 |
500 | 0.125800 | 0.216974 | 0.929600 | 0.929529 |
550 | 0.114600 | 0.216819 | 0.928800 | 0.928722 |
600 | 0.125400 | 0.215816 | 0.928800 | 0.928772 |
650 | 0.096700 | 0.215295 | 0.932000 | 0.931950 |
700 | 0.098400 | 0.216751 | 0.929600 | 0.929566 |
750 | 0.105500 | 0.217510 | 0.928800 | 0.928760 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 559443 KiB | 12777 MiB | 458150 GiB | 458149 GiB | | from large pool | 541440 KiB | 12712 MiB | 456448 GiB | 456447 GiB | | from small pool | 18003 KiB | 67 MiB | 1702 GiB | 1702 GiB | |---------------------------------------------------------------------------| | Active memory | 559443 KiB | 12777 MiB | 458150 GiB | 458149 GiB | | from large pool | 541440 KiB | 12712 MiB | 456448 GiB | 456447 GiB | | from small pool | 18003 KiB | 67 MiB | 1702 GiB | 1702 GiB | |---------------------------------------------------------------------------| | Requested memory | 557230 KiB | 12774 MiB | 457760 GiB | 457760 GiB | | from large pool | 539228 KiB | 12708 MiB | 456067 GiB | 456066 GiB | | from small pool | 18002 KiB | 67 MiB | 1693 GiB | 1693 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 13058 MiB | 13058 MiB | 49978 MiB | 36920 MiB | | from large pool | 12988 MiB | 12988 MiB | 49724 MiB | 36736 MiB | | from small pool | 70 MiB | 70 MiB | 254 MiB | 184 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 186029 KiB | 403075 KiB | 80605 GiB | 80605 GiB | | from large pool | 179456 KiB | 396800 KiB | 78771 GiB | 78771 GiB | | from small pool | 6573 KiB | 43066 KiB | 1833 GiB | 1833 GiB | |---------------------------------------------------------------------------| | Allocations | 860 | 1343 | 42260 K | 42259 K | | from large pool | 80 | 334 | 11805 K | 11805 K | | from small pool | 780 | 1235 | 30454 K | 30454 K | |---------------------------------------------------------------------------| | Active allocs | 860 | 1343 | 42260 K | 42259 K | | from large pool | 80 | 334 | 11805 K | 11805 K | | from small pool | 780 | 1235 | 30454 K | 30454 K | |---------------------------------------------------------------------------| | GPU reserved segments | 263 | 266 | 973 | 710 | | from large pool | 228 | 231 | 846 | 618 | | from small pool | 35 | 35 | 127 | 92 | |---------------------------------------------------------------------------| | Non-releasable allocs | 39 | 91 | 18617 K | 18617 K | | from large pool | 19 | 25 | 2343 K | 2343 K | | from small pool | 20 | 73 | 16273 K | 16273 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2876, |B|=0.1878, |∇A|=1.438e-05, |∇B|=1.287e-05, |LoRA(x)|=3.684e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.2912, |B|=0.1936, |∇A|=6.889e-06, |∇B|=1.268e-05, |LoRA(x)|=4.189e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2807, |B|=0.1575, |∇A|=9.492e-06, |∇B|=1.722e-05, |LoRA(x)|=4.119e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2686, |B|=0.1646, |∇A|=1.672e-05, |∇B|=3.348e-05, |LoRA(x)|=2.419e+04, B≠0=12288 distilbert.transformer.layer.0.ffn.lin1: |A|=0.3048, |B|=0.2006, |∇A|=2.493e-05, |∇B|=1.487e-05, |LoRA(x)|=9.104e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2678, |B|=0.1636, |∇A|=9.089e-06, |∇B|=3.795e-05, |LoRA(x)|=4.017e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2823, |B|=0.1744, |∇A|=1.387e-05, |∇B|=1.468e-05, |LoRA(x)|=2.939e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.2715, |B|=0.179, |∇A|=7.818e-06, |∇B|=1.444e-05, |LoRA(x)|=3.56e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2631, |B|=0.147, |∇A|=1.15e-05, |∇B|=2.431e-05, |LoRA(x)|=4.313e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2673, |B|=0.1493, |∇A|=1.245e-05, |∇B|=3.251e-05, |LoRA(x)|=3.406e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.2883, |B|=0.1864, |∇A|=1.843e-05, |∇B|=1.683e-05, |LoRA(x)|=9.673e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.2769, |B|=0.1594, |∇A|=1.305e-05, |∇B|=5.319e-05, |LoRA(x)|=2.091e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2743, |B|=0.1739, |∇A|=1.158e-05, |∇B|=1.88e-05, |LoRA(x)|=3.51e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2883, |B|=0.1874, |∇A|=1.342e-05, |∇B|=1.908e-05, |LoRA(x)|=4.053e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2556, |B|=0.124, |∇A|=7.334e-06, |∇B|=1.962e-05, |LoRA(x)|=5.071e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.262, |B|=0.1546, |∇A|=2.735e-05, |∇B|=4.336e-05, |LoRA(x)|=1.624e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.3036, |B|=0.2016, |∇A|=2.057e-05, |∇B|=1.898e-05, |LoRA(x)|=1.181e+05, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2657, |B|=0.1531, |∇A|=1.202e-05, |∇B|=5.005e-05, |LoRA(x)|=3.564e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.2763, |B|=0.1892, |∇A|=1.221e-05, |∇B|=2.084e-05, |LoRA(x)|=3.428e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.2921, |B|=0.1972, |∇A|=2.221e-05, |∇B|=2.332e-05, |LoRA(x)|=3.677e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.259, |B|=0.1371, |∇A|=1.384e-05, |∇B|=1.967e-05, |LoRA(x)|=3.364e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2623, |B|=0.1423, |∇A|=3.489e-05, |∇B|=3.26e-05, |LoRA(x)|=1.808e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2605, |B|=0.1675, |∇A|=2.439e-05, |∇B|=1.716e-05, |LoRA(x)|=6.808e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2554, |B|=0.1372, |∇A|=1.111e-05, |∇B|=4.632e-05, |LoRA(x)|=3.403e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.2629, |B|=0.1879, |∇A|=9.482e-06, |∇B|=1.871e-05, |LoRA(x)|=3.945e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2739, |B|=0.1899, |∇A|=2.691e-05, |∇B|=2.026e-05, |LoRA(x)|=3.793e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2427, |B|=0.1229, |∇A|=9.548e-06, |∇B|=1.285e-05, |LoRA(x)|=3.232e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2637, |B|=0.1353, |∇A|=3.307e-05, |∇B|=2.627e-05, |LoRA(x)|=2.043e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2615, |B|=0.151, |∇A|=1.416e-05, |∇B|=8.211e-06, |LoRA(x)|=8.067e+04, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2385, |B|=0.1152, |∇A|=1.275e-06, |∇B|=1.137e-05, |LoRA(x)|=1.052e+05, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.2649, |B|=0.1849, |∇A|=5.999e-06, |∇B|=1.623e-05, |LoRA(x)|=6.015e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.2639, |B|=0.1595, |∇A|=1.905e-05, |∇B|=9.632e-06, |LoRA(x)|=4.3e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.2426, |B|=0.1004, |∇A|=7.553e-06, |∇B|=8.706e-06, |LoRA(x)|=3.525e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.2455, |B|=0.1125, |∇A|=1.371e-05, |∇B|=1.414e-05, |LoRA(x)|=1.956e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.244, |B|=0.1313, |∇A|=6.42e-06, |∇B|=3.114e-06, |LoRA(x)|=6.599e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2351, |B|=0.1128, |∇A|=2.28e-07, |∇B|=5.508e-06, |LoRA(x)|=2.037e+05, B≠0=12288
Training Summary¶
Freezing then unfreezing layer 5 FFNs (in the hope other layers could catch up) actually didn't help
In [14]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.287240594625473 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.18772827088832855 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2909419536590576 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.19361740350723267 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.280423641204834 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.1574709117412567 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2683144807815552 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.1646396517753601 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3044332265853882 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.2005985826253891 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.26756101846694946 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16362197697162628 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2821109890937805 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17438463866710663 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2712245583534241 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17899440228939056 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.26285862922668457 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.14696842432022095 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2671390771865845 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14929549396038055 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.28791946172714233 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.1863688826560974 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.27668440341949463 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15934498608112335 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2741093635559082 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.17391812801361084 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.28805288672447205 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1873616874217987 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.25546205043792725 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12401476502418518 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26189106702804565 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15452931821346283 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.30333781242370605 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.20153628289699554 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2654007375240326 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.15304669737815857 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2759910225868225 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.1892491579055786 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.29188308119773865 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.19718174636363983 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2588292062282562 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13704168796539307 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2620876431465149 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.14226451516151428 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.26023590564727783 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16748926043510437 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.25515085458755493 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13720864057540894 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2625563144683838 distilbert.transformer.layer.4.attention.q_lin.lora.B 0.18786773085594177 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2736532986164093 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.18987171351909637 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.2424963414669037 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12292399257421494 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.2634068727493286 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.13521724939346313 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2611706852912903 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15097293257713318 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23863813281059265 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.11519813537597656 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.26431819796562195 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1849026381969452 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.26360854506492615 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1594821661710739 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24227461218833923 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.1004873663187027 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24558258056640625 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11250372976064682 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.2434004843235016 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1312120109796524 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.2350156307220459 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11283187568187714 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 40.5990 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 26.4484 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 41.0335 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 27.1989 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 39.8708 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 22.4580 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.2828 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 23.3685 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.3373 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 56.7995 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 76.9791 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 23.3217 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 40.1453 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 24.7263 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 38.3354 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 25.1513 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 37.3621 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 20.9161 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.1554 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 21.3214 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 40.6530 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 52.8055 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 79.4976 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 22.7980 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 38.8030 distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 24.5728 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 40.7285 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 26.4093 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 36.4713 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 17.7762 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 37.4290 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 22.0014 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 43.0048 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 57.3172 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 76.4310 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 22.0275 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 39.1868 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 26.6021 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 41.4129 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 27.7546 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 36.8722 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 19.7314 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 37.3789 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 20.2694 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 36.8746 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 47.5508 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 73.5625 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 19.6110 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 37.2517 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 26.5313 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 38.8101 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 26.8539 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.4397 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 17.5436 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 37.8973 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 19.4158 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 36.8792 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 42.9426 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 67.2702 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 16.7655 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 37.5755 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 26.1897 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 37.4424 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 22.9606 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.5195 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 14.7072 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.6669 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 16.2908 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 34.6663 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 38.2804 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 65.7187 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 16.5426 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.078249931335449 distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9629707336425781 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0608773231506348 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9582805633544922 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.936693549156189 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.963030219078064 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9090875387191772 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9155340194702148 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.004789352416992 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9639787673950195 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9920339584350586 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.974652647972107 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9344266653060913 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9241001605987549 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9053921699523926 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9328597784042358 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0111687183380127 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9665979146957397 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0546875 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.987182855606079 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.920434832572937 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9206979274749756 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.900865912437439 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.908034324645996 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.02260160446167 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9500030279159546 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0796260833740234 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.943003535270691 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9345171451568604 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9029006958007812 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9271156787872314 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.919654130935669 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.04514741897583 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9204539060592651 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.066929340362549 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9324769973754883 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.829803466796875 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9489554166793823 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9196603298187256 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.8496103286743164 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.94928777217865 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9072329998016357 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.0761525630950928 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9125897884368896 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.8967959880828857 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.904982566833496 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.8897030353546143 distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9557445049285889 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.5368 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.0002 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 58.0945 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8807 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.3976 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 55.0108 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.5766 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 53.7535 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.4095 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.0503 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.9582 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.2007 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 54.1290 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8809 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.3206 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.1953 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.3965 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 55.0122 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.7370 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.6521 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.7029 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 53.8140 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.3594 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.4964 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.8060 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.5576 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.4434 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.5110 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 54.1719 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.3051 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 54.0121 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.7271 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.5159 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7450 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 58.1790 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.0628 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.6230 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.3730 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.0633 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 52.0945 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.0960 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.4967 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 58.2587 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.5461 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.3409 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.3365 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.2633 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5414 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.3918760418891907 distilbert.transformer.layer.0.attention.q_lin.m_in 0.2821083962917328 distilbert.transformer.layer.0.attention.k_lin.m_out 0.388096421957016 distilbert.transformer.layer.0.attention.k_lin.m_in 0.2891788184642792 distilbert.transformer.layer.0.attention.v_lin.m_out 0.2867240905761719 distilbert.transformer.layer.0.attention.v_lin.m_in 0.28405582904815674 distilbert.transformer.layer.0.attention.out_lin.m_out 0.26317834854125977 distilbert.transformer.layer.0.attention.out_lin.m_in 0.25337088108062744 distilbert.transformer.layer.1.attention.q_lin.m_out 0.3328227698802948 distilbert.transformer.layer.1.attention.q_lin.m_in 0.27658596634864807 distilbert.transformer.layer.1.attention.k_lin.m_out 0.3300029933452606 distilbert.transformer.layer.1.attention.k_lin.m_in 0.27019354701042175 distilbert.transformer.layer.1.attention.v_lin.m_out 0.2650151252746582 distilbert.transformer.layer.1.attention.v_lin.m_in 0.2554093599319458 distilbert.transformer.layer.1.attention.out_lin.m_out 0.25520753860473633 distilbert.transformer.layer.1.attention.out_lin.m_in 0.25274693965911865 distilbert.transformer.layer.2.attention.q_lin.m_out 0.32323920726776123 distilbert.transformer.layer.2.attention.q_lin.m_in 0.26473814249038696 distilbert.transformer.layer.2.attention.k_lin.m_out 0.3654788136482239 distilbert.transformer.layer.2.attention.k_lin.m_in 0.29941341280937195 distilbert.transformer.layer.2.attention.v_lin.m_out 0.24500706791877747 distilbert.transformer.layer.2.attention.v_lin.m_in 0.24774298071861267 distilbert.transformer.layer.2.attention.out_lin.m_out 0.26631924510002136 distilbert.transformer.layer.2.attention.out_lin.m_in 0.23847880959510803 distilbert.transformer.layer.3.attention.q_lin.m_out 0.34433692693710327 distilbert.transformer.layer.3.attention.q_lin.m_in 0.2763220965862274 distilbert.transformer.layer.3.attention.k_lin.m_out 0.38676881790161133 distilbert.transformer.layer.3.attention.k_lin.m_in 0.28456243872642517 distilbert.transformer.layer.3.attention.v_lin.m_out 0.25427114963531494 distilbert.transformer.layer.3.attention.v_lin.m_in 0.23790866136550903 distilbert.transformer.layer.3.attention.out_lin.m_out 0.2615028917789459 distilbert.transformer.layer.3.attention.out_lin.m_in 0.25258609652519226 distilbert.transformer.layer.4.attention.q_lin.m_out 0.35477954149246216 distilbert.transformer.layer.4.attention.q_lin.m_in 0.2452540099620819 distilbert.transformer.layer.4.attention.k_lin.m_out 0.3896406590938568 distilbert.transformer.layer.4.attention.k_lin.m_in 0.25737807154655457 distilbert.transformer.layer.4.attention.v_lin.m_out 0.22937028110027313 distilbert.transformer.layer.4.attention.v_lin.m_in 0.24845485389232635 distilbert.transformer.layer.4.attention.out_lin.m_out 0.278006374835968 distilbert.transformer.layer.4.attention.out_lin.m_in 0.2266145646572113 distilbert.transformer.layer.5.attention.q_lin.m_out 0.32691702246665955 distilbert.transformer.layer.5.attention.q_lin.m_in 0.25372225046157837 distilbert.transformer.layer.5.attention.k_lin.m_out 0.38233980536460876 distilbert.transformer.layer.5.attention.k_lin.m_in 0.24921034276485443 distilbert.transformer.layer.5.attention.v_lin.m_out 0.25670918822288513 distilbert.transformer.layer.5.attention.v_lin.m_in 0.22960709035396576 distilbert.transformer.layer.5.attention.out_lin.m_out 0.2548404633998871 distilbert.transformer.layer.5.attention.out_lin.m_in 0.22926835715770721 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 13.5111 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 10.3667 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 13.5175 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.3886 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 10.3022 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.2630 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.6783 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.7998 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 11.8675 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.2858 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 11.5865 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.6285 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 9.4300 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.7012 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 9.0223 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.8704 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.3432 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.6139 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 12.5924 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 10.6472 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 9.0080 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.6870 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.4817 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.2836 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.9146 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.8234 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 13.1844 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.4047 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 9.1708 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.2959 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.2903 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.7511 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.6141 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.1202 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.4419 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.4171 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.4905 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.1461 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.0894 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.3265 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.9401 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.5767 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.9022 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.2130 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.3175 distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.0608 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 9.4061 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.4106
In [16]:
import torch.nn.init as init
for name, module in model_ddora_all_attn.named_modules():
if "transformer.layer.5" in name and hasattr(module, "lora_A") and hasattr(module, "lora_B"):
print(f"Resetting LoRA weights in: {name}")
init.normal_(module.lora_A.weight, mean=0.0, std=0.01)
init.normal_(module.lora_B.weight, mean=0.0, std=0.01)
if hasattr(module, "m_in"):
init.normal_(module.m_in, mean=0.0, std=0.01)
if hasattr(module, "m_out"):
init.normal_(module.m_out, mean=0.0, std=0.01)
for name, module in model_ddora_all_attn.named_modules():
if "transformer.layer.5" in name and hasattr(module, "lora_A"):
A_std = module.lora_A.weight.std().item()
B_std = module.lora_B.weight.std().item()
print(f"{name}: |A|={module.lora_A.weight.abs().mean():.4f}, std={A_std:.4f} |B|={module.lora_B.weight.abs().mean():.4f}, std={B_std:.4f}")
In [17]:
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(name, param.abs().mean().item())
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.287240594625473 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.18772827088832855 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2909419536590576 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.19361740350723267 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.280423641204834 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.1574709117412567 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2683144807815552 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.1646396517753601 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3044332265853882 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.2005985826253891 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.26756101846694946 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16362197697162628 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2821109890937805 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17438463866710663 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2712245583534241 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17899440228939056 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.26285862922668457 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.14696842432022095 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2671390771865845 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14929549396038055 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.28791946172714233 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.1863688826560974 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.27668440341949463 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15934498608112335 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2741093635559082 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.17391812801361084 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.28805288672447205 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1873616874217987 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.25546205043792725 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12401476502418518 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26189106702804565 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15452931821346283 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.30333781242370605 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.20153628289699554 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2654007375240326 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.15304669737815857 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2759910225868225 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.1892491579055786 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.29188308119773865 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.19718174636363983 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2588292062282562 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13704168796539307 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2620876431465149 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.14226451516151428 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.26023590564727783 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16748926043510437 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.25515085458755493 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13720864057540894 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2625563144683838 distilbert.transformer.layer.4.attention.q_lin.lora.B 0.18786773085594177 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2736532986164093 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.18987171351909637 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.2424963414669037 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12292399257421494 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.2634068727493286 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.13521724939346313 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2611706852912903 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15097293257713318 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23863813281059265 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.11519813537597656 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.26431819796562195 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1849026381969452 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.26360854506492615 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1594821661710739 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24227461218833923 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.1004873663187027 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24558258056640625 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11250372976064682 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.2434004843235016 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1312120109796524 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.2350156307220459 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11283187568187714
In [18]:
for name, module in model_ddora_all_attn.named_modules():
if "transformer.layer.5" in name:
if hasattr(module, "lora_A"):
print(f"{name} has lora_A")
if hasattr(module, "lora_B"):
print(f"{name} has lora_B")
In [19]:
# Manually traverse to layer 5
layer5 = model_ddora_all_attn.distilbert.transformer.layer[5]
# Now inspect submodules
for name, module in layer5.named_modules():
if hasattr(module, "lora_A"):
print(f"{name} has lora_A")
if hasattr(module, "lora_B"):
print(f"{name} has lora_B")
In [20]:
layer5 = model_ddora_all_attn.distilbert.transformer.layer[5]
for name, module in layer5.named_modules():
print(name, type(module))
for name, module in model_ddora_all_attn.named_modules():
if hasattr(module, "lora_A"):
print(name)
<class 'transformers.models.distilbert.modeling_distilbert.TransformerBlock'> attention <class 'transformers.models.distilbert.modeling_distilbert.DistilBertSdpaAttention'> attention.dropout <class 'torch.nn.modules.dropout.Dropout'> attention.q_lin <class '__main__.LinearWithDoubleDoRA'> attention.q_lin.linear <class 'torch.nn.modules.linear.Linear'> attention.q_lin.lora <class '__main__.LoRALayer'> attention.q_lin.lora.dropout <class 'torch.nn.modules.dropout.Dropout'> attention.k_lin <class '__main__.LinearWithDoubleDoRA'> attention.k_lin.linear <class 'torch.nn.modules.linear.Linear'> attention.k_lin.lora <class '__main__.LoRALayer'> attention.k_lin.lora.dropout <class 'torch.nn.modules.dropout.Dropout'> attention.v_lin <class '__main__.LinearWithDoubleDoRA'> attention.v_lin.linear <class 'torch.nn.modules.linear.Linear'> attention.v_lin.lora <class '__main__.LoRALayer'> attention.v_lin.lora.dropout <class 'torch.nn.modules.dropout.Dropout'> attention.out_lin <class '__main__.LinearWithDoubleDoRA'> attention.out_lin.linear <class 'torch.nn.modules.linear.Linear'> attention.out_lin.lora <class '__main__.LoRALayer'> attention.out_lin.lora.dropout <class 'torch.nn.modules.dropout.Dropout'> sa_layer_norm <class 'torch.nn.modules.normalization.LayerNorm'> ffn <class 'transformers.models.distilbert.modeling_distilbert.FFN'> ffn.dropout <class 'torch.nn.modules.dropout.Dropout'> ffn.lin1 <class '__main__.LinearWithDoubleDoRA'> ffn.lin1.linear <class 'torch.nn.modules.linear.Linear'> ffn.lin1.lora <class '__main__.LoRALayer'> ffn.lin1.lora.dropout <class 'torch.nn.modules.dropout.Dropout'> ffn.lin2 <class '__main__.LinearWithDoubleDoRA'> ffn.lin2.linear <class 'torch.nn.modules.linear.Linear'> ffn.lin2.lora <class '__main__.LoRALayer'> ffn.lin2.lora.dropout <class 'torch.nn.modules.dropout.Dropout'> ffn.activation <class 'transformers.activations.GELUActivation'> output_layer_norm <class 'torch.nn.modules.normalization.LayerNorm'>
In [21]:
for name, module in layer5.named_modules():
if hasattr(module, "lora"):
if hasattr(module.lora, "lora_A"):
print(f"{name} has lora_A")
if hasattr(module.lora, "lora_B"):
print(f"{name} has lora_B")
In [22]:
for name, module in layer5.named_modules():
if hasattr(module, "lora"):
print(f"{name} has LoRA module with params:", list(vars(module.lora).keys()))
attention.q_lin has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha'] attention.k_lin has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha'] attention.v_lin has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha'] attention.out_lin has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha'] ffn.lin1 has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha'] ffn.lin2 has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha']
In [23]:
for name, module in layer5.named_modules():
if hasattr(module, "lora"):
lora_params = dict(module.lora.named_parameters())
if "lora_A" in lora_params:
print(f"{name} has lora_A")
if "lora_B" in lora_params:
print(f"{name} has lora_B")
In [ ]: