In [1]:
import torch
import sys
import gc
print(sys.version)
print(f"PyTorch Version: {torch.__version__}")
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
print(f"CUDA Version: {torch.version.cuda}")
print(torch.cuda.get_device_name(0))
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
import bitsandbytes
import peft
import transformers
print(transformers.__version__)
print(f"bitsandbytes version: {bitsandbytes.__version__}")
print(f"peft version: {peft.__version__}")
print(torch.cuda.is_bf16_supported())
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)] PyTorch Version: 2.5.1+cu121 True 1 CUDA Version: 12.1 NVIDIA GeForce RTX 4080 Laptop GPU 4.50.0.dev0 bitsandbytes version: 0.45.3 peft version: 0.15.2.dev0 True
In [2]:
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
imdb_dataset = imdb_dataset.rename_column("label", "labels")
# Split the original test set into a small validation set (5%) and a held-out test set (95%)
test_val_split = imdb_dataset['test'].train_test_split(test_size=0.95, seed=42)
imdb_dataset['validation'] = test_val_split['train']
imdb_dataset['test'] = test_val_split['test']
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
# Determine the number of labels
num_labels = len(set(imdb_dataset["train"]["labels"]))
print(f"Number of labels: {num_labels}")
# Load the tokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Tokenize the whole dataset, truncate to 384 tokens
def tokenize(batch):
return tokenizer(batch["text"], padding=True, truncation=True, max_length=384)
dataset_encoded = imdb_dataset.map(tokenize, batched=True, batch_size=None)
# Load the pretrained model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = (AutoModelForSequenceClassification
.from_pretrained(model_ckpt, num_labels=num_labels)
.to(device))
#print(model)
Number of labels: 2
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [3]:
# Helper functions
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
labels = pred.label_ids
preds = pred.predictions.argmax(-1)
f1 = f1_score(labels, preds, average="weighted")
acc = accuracy_score(labels, preds)
return {"accuracy": acc, "f1": f1}
def count_trainable_parameters(model):
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
return total_params, trainable_params, 100 * trainable_params / total_params
def freeze_model_layers(model, unfreeze_pre_classifier=False):
# Freeze all parameters
for param in model.parameters():
param.requires_grad = False
# Unfreeze LoRA and DoRA-specific params, including lora_norm
for name, param in model.named_parameters():
if (
"lora.A" in name
or "lora.B" in name
or "lora_norm" in name
or name.endswith(".m") # For DoRA
or name.endswith(".m_in") # For DDoRA
or name.endswith(".m_out") # For DDoRA
or "scale" in name
):
param.requires_grad = True
# Unfreeze classifier layer (always)
for name, param in model.named_parameters():
if name.startswith("classifier."):
param.requires_grad = True
# unfreeze pre-classifier
if unfreeze_pre_classifier:
for name, param in model.named_parameters():
if name.startswith("pre_classifier."):
param.requires_grad = True
Double DoRA (DDoRA)¶
Double Weight-Decomposed Low-Rank Adaptation¶
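As implemented in the cell below, DDoRA keeps the frozen base mapping W·x + b and adds a LoRA path that is rescaled on both sides with learnable vectors: the input is multiplied element-wise by m_in and scale_in, the LoRA output is L2-normalised, and the normalised direction is rescaled element-wise by m_out and scale_out before being added to the base output:
output = W·x + b + scale_out ⊙ m_out ⊙ LoRA(scale_in ⊙ m_in ⊙ x) / (‖LoRA(scale_in ⊙ m_in ⊙ x)‖₂ + ε), where LoRA(z) = α · dropout(z·A) · B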
In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.autograd.set_detect_anomaly(True)
class LoRALayer(nn.Module):
def __init__(self, in_dim, out_dim, rank, alpha, dropout_rate=0.0):
super().__init__()
std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
self.B = nn.Parameter(1e-4 * torch.randn(rank, out_dim) * std_dev) # not all zeroes!
self.alpha = alpha
self.dropout = nn.Dropout(dropout_rate)
def forward(self, x):
dropped = self.dropout(x @ self.A)
return self.alpha * (dropped @ self.B)
class LinearWithDoubleDoRA(nn.Module):
def __init__(self, linear, rank, alpha, scaling_factor=1.0):
super().__init__()
self.linear = linear
self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
self.m_out = nn.Parameter(torch.randn(1, linear.out_features) * std_dev)
self.m_in = nn.Parameter(torch.randn(linear.in_features, 1) * std_dev)
# Orthogonal initialization for m_out
#self.m_out = nn.Parameter(torch.empty(1, linear.out_features))
#nn.init.orthogonal_(self.m_out)
# Orthogonal initialization for m_in
#self.m_in = nn.Parameter(torch.empty(linear.in_features, 1))
#nn.init.orthogonal_(self.m_in)
self.scale_out = nn.Parameter(torch.full((1, linear.out_features), float(scaling_factor)))
self.scale_in = nn.Parameter(torch.full((linear.in_features, 1), float(scaling_factor)))
self.last_lora_output_norm = 0.0 # For monitoring
def forward(self, x):
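# DDoRA forward pass:
# (1) rescale the input feature-wise with scale_in and m_in,
# (2) apply the frozen base linear layer to the original (unscaled) input,
# (3) run the LoRA path on the rescaled input and L2-normalise its output,
# (4) rescale the normalised direction with scale_out and m_out and add it to the base output.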
scaled_x = x * self.scale_in.T * self.m_in.T
linear_output = self.linear(x)
lora_output = self.lora(scaled_x)
lora_output_norm = lora_output / (lora_output.norm(p=2, dim=1, keepdim=True) + 1e-9)
self.last_lora_output_norm = lora_output.norm(p=2, dim=-1).mean().item()
dora_modification = self.scale_out * self.m_out * lora_output_norm
return linear_output + dora_modification
def inject_ddora_all_attn(model, rank, alpha, scaling_factor=1.0, dropout_rate=0.0, disable_layers=None):
target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin", "ffn.lin1", "ffn.lin2"]
#target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin"]
if disable_layers is None:
disable_layers = []
for name, module in model.named_modules():
if isinstance(module, nn.Linear) and any(layer in name for layer in target_layers):
# Try to extract layer index from names like "transformer.layer.4.attention.q_lin"
parts = name.split('.')
layer_idx = None
for i, part in enumerate(parts):
if part == "layer" and i + 1 < len(parts):
try:
layer_idx = int(parts[i + 1])
break
except ValueError:
pass
if layer_idx is not None and layer_idx in disable_layers:
continue
parent_name = name.rsplit('.', 1)[0]
parent_module = model.get_submodule(parent_name)
original_linear = getattr(parent_module, name.split('.')[-1])
ddora_layer = LinearWithDoubleDoRA(original_linear, rank, alpha, scaling_factor)
ddora_layer.lora.dropout = nn.Dropout(dropout_rate)
setattr(parent_module, name.split('.')[-1], ddora_layer)
return model
In [5]:
def monitor_lora_parameters(model, threshold=1e-7):
monitor = {
"A_abs_mean": [],
"B_abs_mean": [],
"A_grad_mean": [],
"B_grad_mean": [],
"lora_output_norm": [],
"B_nonzero_count": [],
}
hooks = []
for name, module in model.named_modules():
if hasattr(module, "lora") and hasattr(module.lora, "A") and hasattr(module.lora, "B"):
A_param = module.lora.A
B_param = module.lora.B
# Gradient hooks (directly on nn.Parameter)
if A_param.requires_grad:
hooks.append(A_param.register_hook(lambda grad, n=name: monitor["A_grad_mean"].append((n, grad.abs().mean().item()))))
if B_param.requires_grad:
hooks.append(B_param.register_hook(lambda grad, n=name: monitor["B_grad_mean"].append((n, grad.abs().mean().item()))))
# Forward hook for value stats
def forward_hook(mod, inp, out, n=name):
A_mean = mod.lora.A.abs().mean().item()
B_mean = mod.lora.B.abs().mean().item()
B_nnz = (mod.lora.B.abs() > threshold).sum().item()
monitor["A_abs_mean"].append((n, A_mean))
monitor["B_abs_mean"].append((n, B_mean))
monitor["B_nonzero_count"].append((n, B_nnz))
monitor["lora_output_norm"].append((n, mod.last_lora_output_norm))
hooks.append(module.register_forward_hook(forward_hook))
return hooks, monitor
from transformers import TrainingArguments
def monitor_gradients(model):
hooks = []
gradient_history = {}
for name, param in model.named_parameters():
if param.requires_grad:
gradient_history[name] = []
def get_hook(n): # capture the name immediately
def hook(grad):
gradient_history[n].append(grad.abs().mean().item())
return hook
hooks.append(param.register_hook(get_hook(name)))
return hooks, gradient_history
- This study investigates the stability of LoRA-based adapter training on the IMDb dataset. It draws on pure LoRA training on IMDb (https://lzrdgreen.github.io/LLMs/LoRAonIMDB.html) and the lessons learned there. With zero dropout, |B| ends up nearly 3 orders of magnitude smaller than |A|. Even though |∇B| is much larger than |∇A|, learning is slow and inefficient because the weight update is determined by the product BA: with such a small magnitude, B becomes a weak projector, so despite its large gradients B does not learn enough and most of the burden falls on A. All three adapters (LoRA, DoRA, and DDoRA, a natural extension of DoRA's idea of directional scaling to both the input and output sides) trained on IMDb with zero dropout can be seen here: https://lzrdgreen.github.io/LLMs/adapters.html
- As we saw in https://lzrdgreen.github.io/LLMs/LoRAonIMDB.html, increasing the dropout changes the training dynamics significantly: with dropout = 40% (applied after the projection with matrix A and before the final projection with matrix B), B is forced to adapt more robustly over time.
- It is easy to check that increasing the dropout even further makes matrix B adapt more; however, since LoRA's effective update is the product BA, training is eventually destroyed by too much noise in the projection through matrix A. A quick way to inspect this balance is sketched below.
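A minimal sketch of such an inspection, assuming only that the adapted modules expose a .lora attribute with matrices A and B as in the LoRALayer defined above (the effective update implied by the forward pass x·A·B is α·A·B):
import torch

@torch.no_grad()
def lora_update_stats(model):
    # Compare the raw magnitudes of A and B with the effective low-rank
    # update ΔW = alpha * A @ B implied by the LoRA forward pass.
    for name, module in model.named_modules():
        if hasattr(module, "lora"):
            A, B = module.lora.A, module.lora.B
            delta_w = module.lora.alpha * (A @ B)  # (in_dim, out_dim)
            print(f"{name}: |A|={A.abs().mean().item():.3g}, "
                  f"|B|={B.abs().mean().item():.3g}, "
                  f"|ΔW|={delta_w.abs().mean().item():.3g}")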
In [6]:
learning_rate = 1e-2 #############
dropout = 0.3 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
output_dir_prefix = "finetuned-imdb-"
import copy
torch.manual_seed(137)
model_ddora_all_attn = copy.deepcopy(model)
model_ddora_all_attn = inject_ddora_all_attn(model_ddora_all_attn, lora_rank, lora_alpha, scaling_factor, dropout)
freeze_model_layers(model_ddora_all_attn, unfreeze_pre_classifier=True)
total_params_ddora, trainable_params_ddora, percentage_ddora = count_trainable_parameters(model_ddora_all_attn)
print(f"\nDDoRA (All Attention) - Total parameters: {total_params_ddora:,}")
print(f"DDoRA (All Attention) - Trainable parameters: {trainable_params_ddora:,} ({percentage_ddora:.2f}%)")
# Sanity check
#print("\nTrainable parameters after freezing:")
#for name, param in model_ddora_all_attn.named_parameters():
# if param.requires_grad:
# print(name)
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
output_dir=f"{output_dir_prefix}lora-all-attn",
num_train_epochs=2,
#max_steps=200,
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=weight_decay,
evaluation_strategy="steps",
eval_steps=eval_steps,
logging_steps=logging_steps,
save_steps=eval_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
disable_tqdm=False,
push_to_hub=False,
max_grad_norm=1.0, #####
report_to="none",
log_level="error"
)
trainer_ddora_all_attn = Trainer(
model=model_ddora_all_attn,
args=training_args_ddora_all_attn,
train_dataset=dataset_encoded["train"],
eval_dataset=dataset_encoded["validation"],
compute_metrics=compute_metrics,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
grouped = defaultdict(list)
for name, val in vals:
grouped[name].append(val)
agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}
# Example output
for name in agg["A_abs_mean"]:
print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
DDoRA (All Attention) - Total parameters: 68,448,002 DDoRA (All Attention) - Trainable parameters: 2,085,122 (3.05%)
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[1564/1564 2:06:50, Epoch 2/2]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.583200 | 0.284775 | 0.880800 | 0.880623 |
100 | 0.331500 | 0.318036 | 0.865600 | 0.865880 |
150 | 0.334500 | 0.290622 | 0.888800 | 0.888612 |
200 | 0.275800 | 0.277798 | 0.884800 | 0.885054 |
250 | 0.289800 | 0.259306 | 0.896800 | 0.896963 |
300 | 0.272700 | 0.256442 | 0.902400 | 0.901592 |
350 | 0.289100 | 0.249133 | 0.907200 | 0.906665 |
400 | 0.252400 | 0.246030 | 0.907200 | 0.907287 |
450 | 0.255500 | 0.275144 | 0.906400 | 0.905762 |
500 | 0.268000 | 0.227364 | 0.906400 | 0.906347 |
550 | 0.258200 | 0.225571 | 0.913600 | 0.913558 |
600 | 0.273100 | 0.239456 | 0.916000 | 0.916031 |
650 | 0.231700 | 0.223158 | 0.915200 | 0.915013 |
700 | 0.230700 | 0.221400 | 0.915200 | 0.915031 |
750 | 0.231000 | 0.268203 | 0.913600 | 0.912997 |
800 | 0.238900 | 0.264123 | 0.904000 | 0.903140 |
850 | 0.214300 | 0.211674 | 0.917600 | 0.917539 |
900 | 0.195900 | 0.220456 | 0.920000 | 0.919857 |
950 | 0.215500 | 0.249347 | 0.916000 | 0.915967 |
1000 | 0.208900 | 0.222482 | 0.918400 | 0.918332 |
1050 | 0.189600 | 0.200232 | 0.919200 | 0.919229 |
1100 | 0.208800 | 0.206135 | 0.911200 | 0.911267 |
1150 | 0.178700 | 0.208787 | 0.919200 | 0.919240 |
1200 | 0.187000 | 0.220926 | 0.918400 | 0.918518 |
1250 | 0.194600 | 0.200717 | 0.922400 | 0.922343 |
1300 | 0.209100 | 0.199653 | 0.921600 | 0.921575 |
1350 | 0.207200 | 0.199614 | 0.916800 | 0.916836 |
1400 | 0.193000 | 0.193851 | 0.924800 | 0.924822 |
1450 | 0.175600 | 0.203755 | 0.924000 | 0.923930 |
1500 | 0.183900 | 0.198105 | 0.924000 | 0.923930 |
1550 | 0.191100 | 0.196167 | 0.924000 | 0.923970 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 564070 KiB | 12301 MiB | 121040 GiB | 121039 GiB | | from large pool | 546048 KiB | 12236 MiB | 120553 GiB | 120552 GiB | | from small pool | 18022 KiB | 67 MiB | 487 GiB | 487 GiB | |---------------------------------------------------------------------------| | Active memory | 564070 KiB | 12301 MiB | 121040 GiB | 121039 GiB | | from large pool | 546048 KiB | 12236 MiB | 120553 GiB | 120552 GiB | | from small pool | 18022 KiB | 67 MiB | 487 GiB | 487 GiB | |---------------------------------------------------------------------------| | Requested memory | 561856 KiB | 12297 MiB | 120831 GiB | 120831 GiB | | from large pool | 543836 KiB | 12231 MiB | 120346 GiB | 120346 GiB | | from small pool | 18020 KiB | 67 MiB | 485 GiB | 485 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 12590 MiB | 12590 MiB | 12590 MiB | 0 B | | from large pool | 12520 MiB | 12520 MiB | 12520 MiB | 0 B | | from small pool | 70 MiB | 70 MiB | 70 MiB | 0 B | |---------------------------------------------------------------------------| | Non-releasable memory | 72858 KiB | 144757 KiB | 10543 GiB | 10543 GiB | | from large pool | 64256 KiB | 137344 KiB | 10019 GiB | 10019 GiB | | from small pool | 8602 KiB | 43066 KiB | 523 GiB | 523 GiB | |---------------------------------------------------------------------------| | Allocations | 868 | 1318 | 11614 K | 11613 K | | from large pool | 82 | 298 | 3067 K | 3067 K | | from small pool | 786 | 1235 | 8547 K | 8546 K | |---------------------------------------------------------------------------| | Active allocs | 868 | 1318 | 11614 K | 11613 K | | from large pool | 82 | 298 | 3067 K | 3067 K | | from small pool | 786 | 1235 | 8547 K | 8546 K | |---------------------------------------------------------------------------| | GPU reserved segments | 271 | 271 | 271 | 0 | | from large pool | 236 | 236 | 236 | 0 | | from small pool | 35 | 35 | 35 | 0 | |---------------------------------------------------------------------------| | Non-releasable allocs | 41 | 90 | 4965 K | 4965 K | | from large pool | 18 | 23 | 444 K | 444 K | | from small pool | 23 | 73 | 4520 K | 4520 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2544, |B|=0.1359, |∇A|=1.05e-05, |∇B|=1.63e-05, |LoRA(x)|=2.025e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.2467, |B|=0.1347, |∇A|=5.453e-06, |∇B|=1.36e-05, |LoRA(x)|=2.095e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2444, |B|=0.1122, |∇A|=6.981e-06, |∇B|=2.181e-05, |LoRA(x)|=2.793e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2346, |B|=0.1088, |∇A|=1.251e-05, |∇B|=4.782e-05, |LoRA(x)|=1.586e+04, B≠0=12288 distilbert.transformer.layer.0.ffn.lin1: 
|A|=0.2488, |B|=0.1355, |∇A|=2.653e-05, |∇B|=1.821e-05, |LoRA(x)|=4.497e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2276, |B|=0.1085, |∇A|=6.454e-06, |∇B|=4.911e-05, |LoRA(x)|=2.049e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2437, |B|=0.117, |∇A|=8.031e-06, |∇B|=1.606e-05, |LoRA(x)|=2.241e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.2425, |B|=0.132, |∇A|=8.43e-06, |∇B|=1.863e-05, |LoRA(x)|=1.773e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2266, |B|=0.107, |∇A|=1.311e-05, |∇B|=4.143e-05, |LoRA(x)|=1.756e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2345, |B|=0.1007, |∇A|=1.681e-05, |∇B|=6.981e-05, |LoRA(x)|=1.455e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.2421, |B|=0.1354, |∇A|=2.222e-05, |∇B|=2.405e-05, |LoRA(x)|=4.663e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.243, |B|=0.1093, |∇A|=8.305e-06, |∇B|=6.969e-05, |LoRA(x)|=2.158e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2483, |B|=0.1337, |∇A|=1.414e-05, |∇B|=2.872e-05, |LoRA(x)|=2.192e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2465, |B|=0.1365, |∇A|=1.158e-05, |∇B|=2.203e-05, |LoRA(x)|=2.198e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2268, |B|=0.09255, |∇A|=6.758e-06, |∇B|=5.326e-05, |LoRA(x)|=3.84e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2352, |B|=0.1127, |∇A|=3.042e-05, |∇B|=8.393e-05, |LoRA(x)|=1.015e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.2533, |B|=0.1324, |∇A|=1.371e-05, |∇B|=2.008e-05, |LoRA(x)|=9.044e+04, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2372, |B|=0.1021, |∇A|=7.97e-06, |∇B|=7.575e-05, |LoRA(x)|=2.32e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.2431, |B|=0.1419, |∇A|=1.03e-05, |∇B|=2.381e-05, |LoRA(x)|=2.253e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.2587, |B|=0.146, |∇A|=1.849e-05, |∇B|=2.731e-05, |LoRA(x)|=2.278e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2391, |B|=0.1046, |∇A|=1.518e-05, |∇B|=4.536e-05, |LoRA(x)|=2.051e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2426, |B|=0.1088, |∇A|=4.642e-05, |∇B|=6.672e-05, |LoRA(x)|=1.067e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2369, |B|=0.1235, |∇A|=3.247e-05, |∇B|=3.287e-05, |LoRA(x)|=3.809e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2361, |B|=0.09859, |∇A|=8.055e-06, |∇B|=7.404e-05, |LoRA(x)|=2.655e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.2393, |B|=0.1376, |∇A|=1.142e-05, |∇B|=3.362e-05, |LoRA(x)|=2.431e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2429, |B|=0.132, |∇A|=2.908e-05, |∇B|=2.764e-05, |LoRA(x)|=2.04e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2302, |B|=0.0967, |∇A|=9.844e-06, |∇B|=3.014e-05, |LoRA(x)|=2.18e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2426, |B|=0.09934, |∇A|=2.959e-05, |∇B|=7.185e-05, |LoRA(x)|=1.232e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2427, |B|=0.1143, |∇A|=1.822e-05, |∇B|=1.799e-05, |LoRA(x)|=4.323e+04, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2287, |B|=0.09016, |∇A|=2.057e-06, |∇B|=5.98e-05, |LoRA(x)|=6.813e+04, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.2438, |B|=0.1335, |∇A|=8.323e-06, |∇B|=3.188e-05, |LoRA(x)|=3.349e+04, B≠0=12288 
distilbert.transformer.layer.5.attention.k_lin: |A|=0.2443, |B|=0.1176, |∇A|=2.251e-05, |∇B|=1.555e-05, |LoRA(x)|=2.596e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.2302, |B|=0.08092, |∇A|=5.421e-06, |∇B|=2.434e-05, |LoRA(x)|=2.488e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.2377, |B|=0.08437, |∇A|=1.694e-05, |∇B|=7.318e-05, |LoRA(x)|=1.172e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.2207, |B|=0.09195, |∇A|=4.348e-06, |∇B|=1.028e-05, |LoRA(x)|=6.974e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2169, |B|=0.08306, |∇A|=1.032e-06, |∇B|=5.664e-05, |LoRA(x)|=8.686e+04, B≠0=12288
Training summary¶
- With dropout = 0.3, dropout in the LoRA path makes A's projection noisy and incomplete, pushing B to compensate. Now ‖B‖ is roughly 50–60% of ‖A‖ across the board (spot-checked in the sketch after this list).
- |∇B| is consistently larger than |∇A|, meaning B is actively adapting, carrying a real representational burden, and no longer acting as a passive post-multiplier. This is critical: dropout lets B become a full partner in the learned LoRA basis rather than a ghost. Dropout is essential for equalising the adaptation pressure on A and B.
- |LoRA(x)| magnitudes are high but still reasonable and not explosive. Further growth of |LoRA(x)| could easily destabilise training.
- DDoRA adds m_in, m_out, and the directional scale factors. These interact nonlinearly, which increases the risk of instability even further. So far, however, training is stable: no exploding norms or vanishing gradients.
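The ‖B‖/‖A‖ observation can be spot-checked directly on the trained model; a minimal sketch (model_ddora_all_attn is the DDoRA model trained above):
for name, module in model_ddora_all_attn.named_modules():
    if hasattr(module, "lora"):
        a = module.lora.A.abs().mean().item()
        b = module.lora.B.abs().mean().item()
        print(f"{name}: |B|/|A| = {b / a:.2f}")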
In [7]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lora" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.scale" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
if "lin.m" in name:
print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.2702498137950897 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.15954196453094482 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2613638639450073 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.15853147208690643 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.25769156217575073 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.130849689245224 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2441953420639038 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.12809059023857117 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.2653345465660095 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.16109466552734375 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2382393330335617 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.12931501865386963 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2573601007461548 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.13697725534439087 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.25658535957336426 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.1549973338842392 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.23516318202018738 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12259270995855331 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2436894178390503 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.11648242175579071 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2548418641090393 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.15947595238685608 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.259360134601593 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.13327041268348694 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2628241777420044 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15530811250209808 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.25873541831970215 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.15722517669200897 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23438052833080292 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.1039535403251648 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24704134464263916 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.13156524300575256 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.2707127332687378 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.1584470272064209 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.24950599670410156 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12168612331151962 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.25530001521110535 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.16401663422584534 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.27414608001708984 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.17030774056911469 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2488800585269928 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.11997505277395248 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.25246256589889526 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.12312033027410507 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24574077129364014 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.1428002119064331 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.2467063069343567 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11558239161968231 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.24940571188926697 
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.15973596274852753 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.25356122851371765 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.15287962555885315 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.23811078071594238 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.11120724678039551 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.25315380096435547 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.11443768441677094 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2513493597507477 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13147471845149994 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.235763818025589 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.1030246838927269 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.2538462281227112 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.15554296970367432 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.25260573625564575 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.13536706566810608 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.2386242002248764 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.09103643894195557 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.2460632026195526 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.0942901223897934 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.22521410882472992 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.10391643643379211 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.22027425467967987 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.09275849908590317 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 38.1128 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 22.4404 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 36.8767 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 22.2844 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 36.3068 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 18.6921 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 34.4764 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 18.2422 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 37.7293 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 45.6011 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 67.8830 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 18.4862 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 36.3440 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 19.5374 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 36.2538 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 21.8859 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 32.9675 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 17.3118 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 34.3291 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 16.4920 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 35.8611 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 45.0163 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 73.6737 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 19.0634 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 36.9587 
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 21.6903 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 36.3556 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 21.9942 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 32.8011 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 14.5148 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 34.9976 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 18.6687 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 38.4237 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 45.1276 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 70.7119 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 17.3970 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 35.9975 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 23.0900 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 38.8575 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 23.9777 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 35.1934 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 17.0103 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 35.7466 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 17.3941 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 34.6405 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 40.3837 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 70.2789 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 16.5210 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 35.1940 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 22.2929 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 35.7177 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 21.4201 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 33.8286 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 15.7936 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 35.9960 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 16.3585 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 35.2749 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 37.4493 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 66.5280 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 15.0055 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 35.6735 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 21.9784 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 35.6887 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 19.5168 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 33.7674 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 12.6754 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.6987 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 13.3650 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 31.5712 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 29.7103 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 61.6965 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 13.7134 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.0612637996673584 
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9753992557525635 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.041314125061035 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9644596576690674 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9422067403793335 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9674242734909058 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9067790508270264 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9497487545013428 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.0170845985412598 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9822043180465698 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9795405864715576 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9653682708740234 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9267244338989258 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9673479795455933 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9100148677825928 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9623349905014038 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0313947200775146 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9576448202133179 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0266146659851074 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9557504653930664 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9229930639266968 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9482643604278564 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.9133390188217163 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9484319686889648 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0091195106506348 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9489374160766602 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0385282039642334 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9662821292877197 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.91229248046875 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9263949394226074 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9312078952789307 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9349567890167236 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.0537590980529785 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.926956057548523 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.062760829925537 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.95039963722229 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.899548888206482 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9275894165039062 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9300048351287842 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.9213910102844238 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.9875534772872925 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.948134183883667 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.0664865970611572 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9336917400360107 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.8987897634506226 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9277005195617676 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.8940513134002686 distilbert.transformer.layer.5.attention.out_lin.scale_in 
1.9552154541015625 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 57.7446 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.2181 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.1671 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8663 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.2293 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.9489 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.2115 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 54.3861 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.3383 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.3214 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.3866 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 54.8853 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.6964 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 54.7598 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2538 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.7067 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.8258 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.6692 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 56.7386 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 54.5777 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.6129 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 54.2336 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.5210 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 54.3786 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.2645 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4216 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 57.0892 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.9664 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 53.4031 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.7915 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.9975 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 54.0654 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.5925 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.8357 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.7800 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.3869 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 53.2735 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 53.8089 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.0365 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 53.7496 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.7071 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 54.3873 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.7937 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.9497 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 
53.3837 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.8037 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 52.9260 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5427 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.342327356338501 distilbert.transformer.layer.0.attention.q_lin.m_in 0.2673478126525879 distilbert.transformer.layer.0.attention.k_lin.m_out 0.33858394622802734 distilbert.transformer.layer.0.attention.k_lin.m_in 0.25746065378189087 distilbert.transformer.layer.0.attention.v_lin.m_out 0.24844586849212646 distilbert.transformer.layer.0.attention.v_lin.m_in 0.2527827024459839 distilbert.transformer.layer.0.attention.out_lin.m_out 0.21927964687347412 distilbert.transformer.layer.0.attention.out_lin.m_in 0.22732524573802948 distilbert.transformer.layer.1.attention.q_lin.m_out 0.2916175127029419 distilbert.transformer.layer.1.attention.q_lin.m_in 0.24687734246253967 distilbert.transformer.layer.1.attention.k_lin.m_out 0.2892614006996155 distilbert.transformer.layer.1.attention.k_lin.m_in 0.2524157166481018 distilbert.transformer.layer.1.attention.v_lin.m_out 0.21986591815948486 distilbert.transformer.layer.1.attention.v_lin.m_in 0.23661664128303528 distilbert.transformer.layer.1.attention.out_lin.m_out 0.22159534692764282 distilbert.transformer.layer.1.attention.out_lin.m_in 0.23196688294410706 distilbert.transformer.layer.2.attention.q_lin.m_out 0.32500144839286804 distilbert.transformer.layer.2.attention.q_lin.m_in 0.2491009682416916 distilbert.transformer.layer.2.attention.k_lin.m_out 0.31428322196006775 distilbert.transformer.layer.2.attention.k_lin.m_in 0.25061824917793274 distilbert.transformer.layer.2.attention.v_lin.m_out 0.2272852063179016 distilbert.transformer.layer.2.attention.v_lin.m_in 0.22342929244041443 distilbert.transformer.layer.2.attention.out_lin.m_out 0.2447393834590912 distilbert.transformer.layer.2.attention.out_lin.m_in 0.23465190827846527 distilbert.transformer.layer.3.attention.q_lin.m_out 0.30937662720680237 distilbert.transformer.layer.3.attention.q_lin.m_in 0.24855463206768036 distilbert.transformer.layer.3.attention.k_lin.m_out 0.32985740900039673 distilbert.transformer.layer.3.attention.k_lin.m_in 0.2726486921310425 distilbert.transformer.layer.3.attention.v_lin.m_out 0.227847620844841 distilbert.transformer.layer.3.attention.v_lin.m_in 0.23485350608825684 distilbert.transformer.layer.3.attention.out_lin.m_out 0.24030470848083496 distilbert.transformer.layer.3.attention.out_lin.m_in 0.24534587562084198 distilbert.transformer.layer.4.attention.q_lin.m_out 0.3439795970916748 distilbert.transformer.layer.4.attention.q_lin.m_in 0.23196381330490112 distilbert.transformer.layer.4.attention.k_lin.m_out 0.3551006317138672 distilbert.transformer.layer.4.attention.k_lin.m_in 0.23938599228858948 distilbert.transformer.layer.4.attention.v_lin.m_out 0.24515563249588013 distilbert.transformer.layer.4.attention.v_lin.m_in 0.23077671229839325 distilbert.transformer.layer.4.attention.out_lin.m_out 0.24853835999965668 distilbert.transformer.layer.4.attention.out_lin.m_in 0.22509190440177917 distilbert.transformer.layer.5.attention.q_lin.m_out 0.29305499792099 distilbert.transformer.layer.5.attention.q_lin.m_in 0.24855470657348633 distilbert.transformer.layer.5.attention.k_lin.m_out 0.3476855754852295 distilbert.transformer.layer.5.attention.k_lin.m_in 0.23641684651374817 distilbert.transformer.layer.5.attention.v_lin.m_out 0.2526254951953888 
distilbert.transformer.layer.5.attention.v_lin.m_in 0.22607746720314026 distilbert.transformer.layer.5.attention.out_lin.m_out 0.2052319496870041 distilbert.transformer.layer.5.attention.out_lin.m_in 0.23453399538993835 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 11.6139 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 9.6670 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 11.5400 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 9.2944 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 8.8192 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 9.1880 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 7.8691 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 8.5758 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 10.1820 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 8.9425 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 10.2440 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.1733 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 7.9383 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 8.6212 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.1016 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 8.6380 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.2154 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 8.9758 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 11.0125 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 9.1251 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.3453 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 8.2017 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 8.8362 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 8.7451 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 10.8372 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.2071 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 11.4118 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 9.7081 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.3508 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 8.9906 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 8.6766 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.3606 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.0146 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 8.5764 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 12.1504 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 8.6597 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.9001 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 8.9268 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 9.0088 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 8.8844 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 10.5646 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.1838 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 11.7903 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.8605 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.4302 
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 8.7683 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 7.6707 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.8617
Set dropout to a more reasonable 0.1¶
Once A and B are well trained, we can reduce the dropout (but avoid dropout = 0.0 at this stage, as it makes all the LoRA adapters die out):
Not to remove B’s role, but to let A+B synthesise.
This would allow the full LoRA (and DDoRA) path to operate unimpeded, leveraging both learned subspaces.
Use different LRs for lora.A and lora.B¶
In [8]:
def set_all_lora_dropout(model, new_dropout_rate):
for module in model.modules():
if isinstance(module, LoRALayer):
module.dropout.p = new_dropout_rate
def print_dropout_rates(model):
for name, module in model.named_modules():
if isinstance(module, LoRALayer):
print(f"{name}.dropout.p = {module.dropout.p}")
def split_lora_dora_params(model):
lora_A_params = []
lora_B_params = []
m_params = []
scale_params = []
for name, param in model.named_parameters():
if not param.requires_grad:
continue
if "lora.A" in name:
lora_A_params.append(param)
elif "lora.B" in name:
lora_B_params.append(param)
elif name.endswith("m_in") or name.endswith("m_out"):
m_params.append(param)
elif "scale" in name:
scale_params.append(param)
return {
"lora_A": lora_A_params,
"lora_B": lora_B_params,
"m": m_params,
"scale": scale_params,
}
def create_custom_optimizer(model, base_lr=1e-4, lr_B_scale=10.0, lr_scale_params=0.2, weight_decay=0.01):
param_groups = split_lora_dora_params(model)
optimizer = torch.optim.AdamW([
{"params": param_groups["lora_A"], "lr": base_lr},
{"params": param_groups["lora_B"], "lr": base_lr * lr_B_scale},
{"params": param_groups["m"], "lr": base_lr},
{"params": param_groups["scale"], "lr": base_lr * lr_scale_params},
], weight_decay=weight_decay)
return optimizer
In [9]:
# Set dropout to 0.1 to avoid overheating the lora.B channel.
# But avoid dropout = 0.0 once lora.B is already large: that makes the LoRA adapters die out,
# likely due to unregularised overfitting and gradient collapse on low-magnitude parameters.
set_all_lora_dropout(model_ddora_all_attn, 0.1)
print_dropout_rates(model_ddora_all_attn)
dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 3e-3 ###############
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
output_dir=f"{output_dir_prefix}lora-all-attn",
num_train_epochs=2,
#max_steps=200,
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=weight_decay,
evaluation_strategy="steps",
eval_steps=eval_steps,
logging_steps=logging_steps,
save_steps=eval_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
disable_tqdm=False,
push_to_hub=False,
max_grad_norm=1.0,
report_to="none",
log_level="error"
)
trainer_ddora_all_attn = Trainer(
model=model_ddora_all_attn,
args=training_args_ddora_all_attn,
train_dataset=dataset_encoded["train"],
eval_dataset=dataset_encoded["validation"],
compute_metrics=compute_metrics,
)
trainer_ddora_all_attn.optimizer = create_custom_optimizer(
trainer_ddora_all_attn.model,
base_lr=3e-3, ###########
lr_B_scale=0.5, #############
lr_scale_params=0.75, #########
weight_decay=1e-5,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
grouped = defaultdict(list)
for name, val in vals:
grouped[name].append(val)
agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}
# Example output
for name in agg["A_abs_mean"]:
print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[1564/1564 2:18:14, Epoch 2/2]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.190300 | 0.189563 | 0.926400 | 0.926298 |
100 | 0.154800 | 0.200476 | 0.923200 | 0.923212 |
150 | 0.168100 | 0.203508 | 0.928000 | 0.927841 |
200 | 0.126000 | 0.215565 | 0.924000 | 0.923856 |
250 | 0.144500 | 0.201292 | 0.924000 | 0.923902 |
300 | 0.145700 | 0.195318 | 0.919200 | 0.919206 |
350 | 0.153700 | 0.189462 | 0.925600 | 0.925583 |
400 | 0.116700 | 0.207675 | 0.923200 | 0.923163 |
450 | 0.128700 | 0.215511 | 0.920000 | 0.920012 |
500 | 0.138200 | 0.208557 | 0.923200 | 0.923163 |
550 | 0.142200 | 0.205139 | 0.926400 | 0.926188 |
600 | 0.146700 | 0.206250 | 0.920800 | 0.920666 |
650 | 0.111200 | 0.211345 | 0.924000 | 0.923930 |
700 | 0.109600 | 0.223709 | 0.923200 | 0.922996 |
750 | 0.125400 | 0.221195 | 0.921600 | 0.921427 |
800 | 0.122200 | 0.213449 | 0.920000 | 0.920012 |
850 | 0.093700 | 0.222432 | 0.922400 | 0.922356 |
900 | 0.079700 | 0.247886 | 0.923200 | 0.922996 |
950 | 0.093800 | 0.240048 | 0.920000 | 0.919873 |
1000 | 0.085900 | 0.235864 | 0.923200 | 0.923188 |
1050 | 0.096600 | 0.234580 | 0.924800 | 0.924666 |
1100 | 0.107200 | 0.225129 | 0.923200 | 0.923136 |
1150 | 0.079500 | 0.227587 | 0.925600 | 0.925532 |
1200 | 0.097200 | 0.227462 | 0.924800 | 0.924788 |
1250 | 0.111300 | 0.223540 | 0.924000 | 0.923957 |
1300 | 0.119000 | 0.221720 | 0.924000 | 0.923916 |
1350 | 0.132400 | 0.215398 | 0.928800 | 0.928694 |
1400 | 0.132600 | 0.211440 | 0.924000 | 0.923944 |
1450 | 0.140300 | 0.210785 | 0.924800 | 0.924764 |
1500 | 0.149600 | 0.210460 | 0.926400 | 0.926326 |
1550 | 0.156600 | 0.209057 | 0.924800 | 0.924751 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 559443 KiB | 12301 MiB | 242076 GiB | 242076 GiB | | from large pool | 541440 KiB | 12236 MiB | 241101 GiB | 241101 GiB | | from small pool | 18003 KiB | 67 MiB | 974 GiB | 974 GiB | |---------------------------------------------------------------------------| | Active memory | 559443 KiB | 12301 MiB | 242076 GiB | 242076 GiB | | from large pool | 541440 KiB | 12236 MiB | 241101 GiB | 241101 GiB | | from small pool | 18003 KiB | 67 MiB | 974 GiB | 974 GiB | |---------------------------------------------------------------------------| | Requested memory | 557230 KiB | 12297 MiB | 241659 GiB | 241659 GiB | | from large pool | 539228 KiB | 12231 MiB | 240689 GiB | 240689 GiB | | from small pool | 18002 KiB | 67 MiB | 970 GiB | 970 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 12554 MiB | 12590 MiB | 24540 MiB | 11986 MiB | | from large pool | 12484 MiB | 12520 MiB | 24408 MiB | 11924 MiB | | from small pool | 70 MiB | 70 MiB | 132 MiB | 62 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 75437 KiB | 146440 KiB | 21097 GiB | 21097 GiB | | from large pool | 68864 KiB | 141056 KiB | 20048 GiB | 20048 GiB | | from small pool | 6573 KiB | 43066 KiB | 1049 GiB | 1049 GiB | |---------------------------------------------------------------------------| | Allocations | 860 | 1318 | 23222 K | 23222 K | | from large pool | 80 | 298 | 6133 K | 6133 K | | from small pool | 780 | 1235 | 17089 K | 17088 K | |---------------------------------------------------------------------------| | Active allocs | 860 | 1318 | 23222 K | 23222 K | | from large pool | 80 | 298 | 6133 K | 6133 K | | from small pool | 780 | 1235 | 17089 K | 17088 K | |---------------------------------------------------------------------------| | GPU reserved segments | 270 | 271 | 515 | 245 | | from large pool | 235 | 236 | 449 | 214 | | from small pool | 35 | 35 | 66 | 31 | |---------------------------------------------------------------------------| | Non-releasable allocs | 38 | 90 | 10044 K | 10044 K | | from large pool | 18 | 24 | 890 K | 890 K | | from small pool | 20 | 73 | 9153 K | 9153 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2775, |B|=0.1617, |∇A|=2.004e-05, |∇B|=1.97e-05, |LoRA(x)|=2.146e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.2666, |B|=0.1605, |∇A|=8.605e-06, |∇B|=1.567e-05, |LoRA(x)|=2.343e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2632, |B|=0.134, |∇A|=1.352e-05, |∇B|=2.356e-05, |LoRA(x)|=2.373e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2496, |B|=0.1311, |∇A|=2.109e-05, |∇B|=5.005e-05, |LoRA(x)|=1.2e+04, B≠0=12288 
distilbert.transformer.layer.0.ffn.lin1: |A|=0.2715, |B|=0.1632, |∇A|=3.528e-05, |∇B|=1.96e-05, |LoRA(x)|=5.078e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.243, |B|=0.1318, |∇A|=1.097e-05, |∇B|=5.569e-05, |LoRA(x)|=1.701e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2634, |B|=0.1393, |∇A|=1.438e-05, |∇B|=1.748e-05, |LoRA(x)|=1.95e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.2621, |B|=0.1571, |∇A|=1.328e-05, |∇B|=2.068e-05, |LoRA(x)|=1.912e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.2403, |B|=0.1252, |∇A|=1.612e-05, |∇B|=3.199e-05, |LoRA(x)|=1.786e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2473, |B|=0.1195, |∇A|=2.096e-05, |∇B|=5.373e-05, |LoRA(x)|=1.431e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.2604, |B|=0.1617, |∇A|=3.089e-05, |∇B|=2.576e-05, |LoRA(x)|=4.777e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.264, |B|=0.1358, |∇A|=1.869e-05, |∇B|=7.984e-05, |LoRA(x)|=1.356e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2679, |B|=0.1572, |∇A|=1.893e-05, |∇B|=2.563e-05, |LoRA(x)|=2.383e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2658, |B|=0.1594, |∇A|=1.879e-05, |∇B|=2.583e-05, |LoRA(x)|=2.269e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.2404, |B|=0.1073, |∇A|=6.702e-06, |∇B|=2.483e-05, |LoRA(x)|=4.308e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2511, |B|=0.1343, |∇A|=3.909e-05, |∇B|=6.579e-05, |LoRA(x)|=1.082e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.2761, |B|=0.1607, |∇A|=2.807e-05, |∇B|=2.521e-05, |LoRA(x)|=6.752e+04, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2535, |B|=0.1248, |∇A|=1.891e-05, |∇B|=7.665e-05, |LoRA(x)|=1.603e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.2614, |B|=0.1661, |∇A|=1.345e-05, |∇B|=2.416e-05, |LoRA(x)|=2.479e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.279, |B|=0.1721, |∇A|=2.81e-05, |∇B|=3.036e-05, |LoRA(x)|=2.496e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2532, |B|=0.1227, |∇A|=1.519e-05, |∇B|=2.587e-05, |LoRA(x)|=2.351e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2567, |B|=0.1261, |∇A|=4.712e-05, |∇B|=3.786e-05, |LoRA(x)|=1.333e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.2508, |B|=0.1454, |∇A|=2.706e-05, |∇B|=2.267e-05, |LoRA(x)|=4.743e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.2494, |B|=0.1191, |∇A|=1.207e-05, |∇B|=4.559e-05, |LoRA(x)|=2.457e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.2541, |B|=0.162, |∇A|=1.095e-05, |∇B|=2.543e-05, |LoRA(x)|=3.06e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2586, |B|=0.1548, |∇A|=2.646e-05, |∇B|=2.163e-05, |LoRA(x)|=2.408e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.242, |B|=0.1145, |∇A|=1.132e-05, |∇B|=1.651e-05, |LoRA(x)|=2.372e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2577, |B|=0.1182, |∇A|=3.214e-05, |∇B|=3.197e-05, |LoRA(x)|=1.449e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2556, |B|=0.1342, |∇A|=1.524e-05, |∇B|=1.086e-05, |LoRA(x)|=5.044e+04, B≠0=49152 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2387, |B|=0.1068, |∇A|=2.781e-06, |∇B|=1.757e-05, |LoRA(x)|=6.574e+04, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.2589, |B|=0.1577, |∇A|=1.034e-05, |∇B|=2.488e-05, 
|LoRA(x)|=3.877e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.2566, |B|=0.1382, |∇A|=2.165e-05, |∇B|=1.07e-05, |LoRA(x)|=3.278e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: |A|=0.2421, |B|=0.09396, |∇A|=8.988e-06, |∇B|=9.457e-06, |LoRA(x)|=2.753e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.25, |B|=0.0986, |∇A|=1.875e-05, |∇B|=1.969e-05, |LoRA(x)|=1.393e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.2301, |B|=0.107, |∇A|=4.039e-06, |∇B|=3.966e-06, |LoRA(x)|=7.941e+04, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2232, |B|=0.09474, |∇A|=2.928e-07, |∇B|=6.926e-06, |LoRA(x)|=1.233e+05, B≠0=12288
Training summary
- Gradients are not collapsing: both ∇A and ∇B stay healthy (all gradient norms in the ~1e-5 to 7e-5 range), with no sign of vanishing or exploding gradients.
- |B| is consistently smaller than |A| (roughly 0.22–0.28 for A versus 0.10–0.17 for B); lr_B_scale = 0.5 kept the B updates more conservative.
- |LoRA(x)| is largest in the FFN layers, and layer 5 shows the largest activations in the LoRA attention paths, especially q_lin. Its gradients are also still substantial (|∇B| = 2.488e-05 in layer 5 q_lin versus 1.97e-05 in layer 0), which means layer 5 is still actively training. That makes the layer-5 freezing applied in the next section premature: treat it as a demonstration only, or retrain the model yourself without freezing layer 5. A sketch of such a gradient-based check follows this list.
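To make that call less ad hoc, per-layer gradient activity can be compared before freezing anything. The snippet below is a minimal, hypothetical sketch (layers_safe_to_freeze is not part of the notebook); it assumes the monitor1 structure used above, i.e. monitor1["B_grad_mean"] is a list of (parameter_name, value) pairs, averages |∇B| per transformer layer, and only flags a layer for freezing when its activity has fallen well below the overall mean.

import re
from collections import defaultdict

def layers_safe_to_freeze(monitor, key="B_grad_mean", ratio=0.5):
    """Return layer indices whose mean |∇B| is below `ratio` x the mean across layers."""
    per_layer = defaultdict(list)
    for name, val in monitor[key]:
        match = re.search(r"\.layer\.(\d+)\.", name)
        if match:
            per_layer[int(match.group(1))].append(val)
    layer_means = {idx: sum(vs) / len(vs) for idx, vs in per_layer.items()}
    overall = sum(layer_means.values()) / len(layer_means)
    return [idx for idx, m in sorted(layer_means.items()) if m < ratio * overall]

# With the numbers above, layer 5's |∇B| is comparable to layer 0's, so it would not be
# flagged, and freezing it is premature.
print(layers_safe_to_freeze(monitor1))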
In [10]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.27467310428619385 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.16087943315505981 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.26493778824806213 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.15994273126125336 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.26134049892425537 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.13279186189174652 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.24835559725761414 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.13021254539489746 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.2695589065551758 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.1625066101551056 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2408287227153778 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.13101235032081604 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2614685893058777 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.1385265290737152 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.260200560092926 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.15645989775657654 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2383284568786621 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12416227906942368 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.24609380960464478 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.1184653490781784 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2583957612514496 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.16097836196422577 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.26195210218429565 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.13490238785743713 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.26605361700057983 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15654103457927704 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.2632990777492523 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1587207019329071 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23798131942749023 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.1060485690832138 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24978193640708923 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.1334354281425476 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.2741438150405884 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.1599275767803192 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.251351535320282 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12355349957942963 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.25958922505378723 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.16537396609783173 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2771756947040558 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.17137376964092255 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2521096467971802 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.12177401036024094 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2551520764827728 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.1251428872346878 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24903497099876404 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.14453813433647156 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.24802027642726898 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11787533760070801 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.25245609879493713 
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.16117650270462036 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2567717432975769 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.15406717360019684 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.24032878875732422 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.11345615983009338 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.2565303146839142 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.11726316064596176 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2542564272880554 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13327965140342712 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2369489073753357 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.10564354062080383 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.25673991441726685 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1569884866476059 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2547406554222107 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1376175880432129 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24085545539855957 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.09300228208303452 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24862870573997498 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.09722810983657837 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.22779500484466553 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1058943122625351 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.22217977046966553 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.09405812621116638 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 38.6747 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 22.6266 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 37.3332 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 22.4854 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 36.7740 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 18.9591 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 35.0878 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 18.5062 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 38.2489 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 45.9634 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 68.6745 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 18.7223 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 36.8761 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 19.7546 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 36.6833 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 22.0810 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 33.4225 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 17.5674 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 34.6941 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 16.7691 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 36.3546 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 45.4047 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 74.4587 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 19.2996 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 37.3979 
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 21.8745 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 36.9386 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 22.1939 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 33.2852 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 14.8303 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 35.3576 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 18.9281 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 38.8654 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 45.5287 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 71.2674 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 17.6604 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 36.5726 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 23.2764 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 39.2914 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 24.1566 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 35.6481 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 17.2679 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 36.1505 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 17.6628 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 35.1100 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 40.8434 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 70.7395 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 16.8413 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 35.6301 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 22.4937 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 36.1941 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 21.6074 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.1606 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 16.1095 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 36.5053 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 16.7102 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 35.7436 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 37.9658 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 66.8764 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 15.3375 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 36.0503 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 22.1811 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 36.0766 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 19.7476 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.1035 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 13.0682 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 35.0975 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 13.7873 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 31.9677 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 30.2731 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 62.3751 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 13.9422 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.079176187515259 
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.976986289024353 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0598769187927246 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.963384985923767 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9461816549301147 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9646905660629272 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9072974920272827 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9477274417877197 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.02653169631958 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9823664426803589 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9907326698303223 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9645729064941406 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9299085140228271 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.964238166809082 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9065513610839844 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.956520915031433 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.037914752960205 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9566318988800049 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.038102865219116 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.958008050918579 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9164390563964844 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9445397853851318 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.9135068655014038 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9422650337219238 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0179243087768555 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9496374130249023 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0496463775634766 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9652724266052246 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9071924686431885 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.922563910484314 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9339063167572021 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9289063215255737 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.0540103912353516 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.921581506729126 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.065849781036377 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9476207494735718 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8906574249267578 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9181361198425293 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.92246413230896 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.9140986204147339 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.989418387413025 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9430391788482666 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.071225643157959 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9252580404281616 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.8846511840820312 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9164375066757202 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.880750060081482 distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9473273754119873 
Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.2769 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.2697 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.7125 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8566 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.3819 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.8937 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.2645 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 54.3593 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.6280 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.3418 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.7236 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 54.8762 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.8090 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 54.6914 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.1904 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.5742 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 57.0324 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.6576 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.0930 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 54.6503 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.4667 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 54.1525 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.5581 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 54.2392 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.5301 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4601 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 57.4297 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.9576 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 53.2929 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.7103 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 54.1205 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.9308 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.6259 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7231 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.8851 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.3364 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 53.0858 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 53.5845 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 53.8456 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 53.5895 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.7921 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 54.2675 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.9446 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.7487 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.0761 
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.5358 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 52.6063 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.3667 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.3690018653869629 distilbert.transformer.layer.0.attention.q_lin.m_in 0.27579522132873535 distilbert.transformer.layer.0.attention.k_lin.m_out 0.3665243089199066 distilbert.transformer.layer.0.attention.k_lin.m_in 0.2646521329879761 distilbert.transformer.layer.0.attention.v_lin.m_out 0.26364845037460327 distilbert.transformer.layer.0.attention.v_lin.m_in 0.25755858421325684 distilbert.transformer.layer.0.attention.out_lin.m_out 0.23036295175552368 distilbert.transformer.layer.0.attention.out_lin.m_in 0.2361956685781479 distilbert.transformer.layer.1.attention.q_lin.m_out 0.30955392122268677 distilbert.transformer.layer.1.attention.q_lin.m_in 0.2551559805870056 distilbert.transformer.layer.1.attention.k_lin.m_out 0.3091847002506256 distilbert.transformer.layer.1.attention.k_lin.m_in 0.2579241394996643 distilbert.transformer.layer.1.attention.v_lin.m_out 0.23248517513275146 distilbert.transformer.layer.1.attention.v_lin.m_in 0.24111609160900116 distilbert.transformer.layer.1.attention.out_lin.m_out 0.22805188596248627 distilbert.transformer.layer.1.attention.out_lin.m_in 0.23579005897045135 distilbert.transformer.layer.2.attention.q_lin.m_out 0.33838027715682983 distilbert.transformer.layer.2.attention.q_lin.m_in 0.25477975606918335 distilbert.transformer.layer.2.attention.k_lin.m_out 0.3349052369594574 distilbert.transformer.layer.2.attention.k_lin.m_in 0.26004013419151306 distilbert.transformer.layer.2.attention.v_lin.m_out 0.23051267862319946 distilbert.transformer.layer.2.attention.v_lin.m_in 0.22866296768188477 distilbert.transformer.layer.2.attention.out_lin.m_out 0.25453248620033264 distilbert.transformer.layer.2.attention.out_lin.m_in 0.2382659763097763 distilbert.transformer.layer.3.attention.q_lin.m_out 0.32573193311691284 distilbert.transformer.layer.3.attention.q_lin.m_in 0.2568552494049072 distilbert.transformer.layer.3.attention.k_lin.m_out 0.3485237956047058 distilbert.transformer.layer.3.attention.k_lin.m_in 0.27685362100601196 distilbert.transformer.layer.3.attention.v_lin.m_out 0.23205050826072693 distilbert.transformer.layer.3.attention.v_lin.m_in 0.23978719115257263 distilbert.transformer.layer.3.attention.out_lin.m_out 0.25406739115715027 distilbert.transformer.layer.3.attention.out_lin.m_in 0.2500074505805969 distilbert.transformer.layer.4.attention.q_lin.m_out 0.3506551682949066 distilbert.transformer.layer.4.attention.q_lin.m_in 0.23591524362564087 distilbert.transformer.layer.4.attention.k_lin.m_out 0.3639422357082367 distilbert.transformer.layer.4.attention.k_lin.m_in 0.2439534068107605 distilbert.transformer.layer.4.attention.v_lin.m_out 0.24837157130241394 distilbert.transformer.layer.4.attention.v_lin.m_in 0.2317391335964203 distilbert.transformer.layer.4.attention.out_lin.m_out 0.2500099837779999 distilbert.transformer.layer.4.attention.out_lin.m_in 0.22958096861839294 distilbert.transformer.layer.5.attention.q_lin.m_out 0.3036739230155945 distilbert.transformer.layer.5.attention.q_lin.m_in 0.251531720161438 distilbert.transformer.layer.5.attention.k_lin.m_out 0.3580213785171509 distilbert.transformer.layer.5.attention.k_lin.m_in 0.23563992977142334 distilbert.transformer.layer.5.attention.v_lin.m_out 0.25412318110466003 
distilbert.transformer.layer.5.attention.v_lin.m_in 0.2263183444738388 distilbert.transformer.layer.5.attention.out_lin.m_out 0.20213589072227478 distilbert.transformer.layer.5.attention.out_lin.m_in 0.23632609844207764 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 12.4339 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 9.8294 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 12.3191 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 9.4829 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 9.2940 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 9.3087 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 8.2266 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 8.8123 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 10.6891 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 9.1400 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 10.8123 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.3261 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 8.2857 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 8.7601 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.2666 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 8.7566 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.6090 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.1298 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 11.6100 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 9.3270 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.4656 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 8.3839 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.1117 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 8.8582 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.2639 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.3725 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 11.9785 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 9.8492 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.4670 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.1364 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.1048 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.4892 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.2100 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 8.7015 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 12.3922 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 8.8146 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.9844 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 8.9990 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 8.9444 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.0421 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 10.8569 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.2505 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.0845 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.9086 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.4463 
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 8.8336 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 7.7024 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.9938
In [11]:
def freeze_ddora_layer(model, layer_idx):
    # Freeze every parameter (base weights, LoRA A/B, DoRA magnitudes, scales) in one transformer layer
    target_prefix = f"distilbert.transformer.layer.{layer_idx}."
    for name, param in model.named_parameters():
        if name.startswith(target_prefix):
            param.requires_grad = False

# Freeze the FFN adapters in every layer; only the attention adapters keep training
for i in range(6):
    lin1 = model_ddora_all_attn.distilbert.transformer.layer[i].ffn.lin1
    lin2 = model_ddora_all_attn.distilbert.transformer.layer[i].ffn.lin2
    for param in lin1.parameters():
        param.requires_grad = False
    for param in lin2.parameters():
        param.requires_grad = False
freeze_ddora_layer(model_ddora_all_attn, layer_idx=5)

# List the parameters that remain trainable after freezing
for name, param in model_ddora_all_attn.named_parameters():
    if param.requires_grad:
        print(name)
print_dropout_rates(model_ddora_all_attn)
# Keep LoRA dropout at 0.1 to regularise the lora.B path.
# Avoid dropout = 0.0 when lora.B is already large: that makes the LoRA adapters die out
# (likely due to unregularised overfitting and gradient collapse on the low-magnitude parameters).
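# For reference, a minimal sketch of what set_all_lora_dropout is assumed to do (the actual
# helper is defined earlier in the notebook; the printed ...lora.dropout.p paths below suggest
# each wrapped linear owns a lora.dropout module):
#
#   def set_all_lora_dropout(model, p):
#       for _, module in model.named_modules():
#           if hasattr(module, "lora") and hasattr(module.lora, "dropout"):
#               module.lora.dropout.p = p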
set_all_lora_dropout(model_ddora_all_attn, 0.1)
print_dropout_rates(model_ddora_all_attn)
dropout = 0.1
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor = 2.0
batch_size = 32
learning_rate = 1e-4  # Trainer default; the custom optimizer below sets its own base_lr
from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"
training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=2,
    #max_steps=200,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,
    report_to="none",
    log_level="error",
)
trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)
# create_custom_optimizer (defined earlier) builds separate parameter groups so that lora.B
# and, presumably, the scale/magnitude parameters get their own learning-rate multipliers
# on top of base_lr.
trainer_ddora_all_attn.optimizer = create_custom_optimizer(
    trainer_ddora_all_attn.model,
    base_lr=1e-3,
    lr_B_scale=1.0,       # the earlier run used 0.5 to keep B updates more conservative
    lr_scale_params=1.0,
    weight_decay=1e-5,
)
hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)
#Train!
trainer_ddora_all_attn.train()
print(torch.cuda.memory_summary())
#for hook in hooks2:
# hook.remove()
#for name, grads in gradient_history2.items():
# print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")
for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs) / len(vs) for name, vs in grouped.items()}

for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'].get(name, 0.0):.4g}, |∇B|={agg['B_grad_mean'].get(name, 0.0):.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'].get(name, 0.0):.4g}, B≠0={agg['B_nonzero_count'].get(name, 0):.0f}")
#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
distilbert.transformer.layer.0.attention.q_lin.m_out distilbert.transformer.layer.0.attention.q_lin.m_in distilbert.transformer.layer.0.attention.q_lin.scale_out distilbert.transformer.layer.0.attention.q_lin.scale_in distilbert.transformer.layer.0.attention.q_lin.lora.A distilbert.transformer.layer.0.attention.q_lin.lora.B distilbert.transformer.layer.0.attention.k_lin.m_out distilbert.transformer.layer.0.attention.k_lin.m_in distilbert.transformer.layer.0.attention.k_lin.scale_out distilbert.transformer.layer.0.attention.k_lin.scale_in distilbert.transformer.layer.0.attention.k_lin.lora.A distilbert.transformer.layer.0.attention.k_lin.lora.B distilbert.transformer.layer.0.attention.v_lin.m_out distilbert.transformer.layer.0.attention.v_lin.m_in distilbert.transformer.layer.0.attention.v_lin.scale_out distilbert.transformer.layer.0.attention.v_lin.scale_in distilbert.transformer.layer.0.attention.v_lin.lora.A distilbert.transformer.layer.0.attention.v_lin.lora.B distilbert.transformer.layer.0.attention.out_lin.m_out distilbert.transformer.layer.0.attention.out_lin.m_in distilbert.transformer.layer.0.attention.out_lin.scale_out distilbert.transformer.layer.0.attention.out_lin.scale_in distilbert.transformer.layer.0.attention.out_lin.lora.A distilbert.transformer.layer.0.attention.out_lin.lora.B distilbert.transformer.layer.1.attention.q_lin.m_out distilbert.transformer.layer.1.attention.q_lin.m_in distilbert.transformer.layer.1.attention.q_lin.scale_out distilbert.transformer.layer.1.attention.q_lin.scale_in distilbert.transformer.layer.1.attention.q_lin.lora.A distilbert.transformer.layer.1.attention.q_lin.lora.B distilbert.transformer.layer.1.attention.k_lin.m_out distilbert.transformer.layer.1.attention.k_lin.m_in distilbert.transformer.layer.1.attention.k_lin.scale_out distilbert.transformer.layer.1.attention.k_lin.scale_in distilbert.transformer.layer.1.attention.k_lin.lora.A distilbert.transformer.layer.1.attention.k_lin.lora.B distilbert.transformer.layer.1.attention.v_lin.m_out distilbert.transformer.layer.1.attention.v_lin.m_in distilbert.transformer.layer.1.attention.v_lin.scale_out distilbert.transformer.layer.1.attention.v_lin.scale_in distilbert.transformer.layer.1.attention.v_lin.lora.A distilbert.transformer.layer.1.attention.v_lin.lora.B distilbert.transformer.layer.1.attention.out_lin.m_out distilbert.transformer.layer.1.attention.out_lin.m_in distilbert.transformer.layer.1.attention.out_lin.scale_out distilbert.transformer.layer.1.attention.out_lin.scale_in distilbert.transformer.layer.1.attention.out_lin.lora.A distilbert.transformer.layer.1.attention.out_lin.lora.B distilbert.transformer.layer.2.attention.q_lin.m_out distilbert.transformer.layer.2.attention.q_lin.m_in distilbert.transformer.layer.2.attention.q_lin.scale_out distilbert.transformer.layer.2.attention.q_lin.scale_in distilbert.transformer.layer.2.attention.q_lin.lora.A distilbert.transformer.layer.2.attention.q_lin.lora.B distilbert.transformer.layer.2.attention.k_lin.m_out distilbert.transformer.layer.2.attention.k_lin.m_in distilbert.transformer.layer.2.attention.k_lin.scale_out distilbert.transformer.layer.2.attention.k_lin.scale_in distilbert.transformer.layer.2.attention.k_lin.lora.A distilbert.transformer.layer.2.attention.k_lin.lora.B distilbert.transformer.layer.2.attention.v_lin.m_out distilbert.transformer.layer.2.attention.v_lin.m_in distilbert.transformer.layer.2.attention.v_lin.scale_out distilbert.transformer.layer.2.attention.v_lin.scale_in 
distilbert.transformer.layer.2.attention.v_lin.lora.A distilbert.transformer.layer.2.attention.v_lin.lora.B distilbert.transformer.layer.2.attention.out_lin.m_out distilbert.transformer.layer.2.attention.out_lin.m_in distilbert.transformer.layer.2.attention.out_lin.scale_out distilbert.transformer.layer.2.attention.out_lin.scale_in distilbert.transformer.layer.2.attention.out_lin.lora.A distilbert.transformer.layer.2.attention.out_lin.lora.B distilbert.transformer.layer.3.attention.q_lin.m_out distilbert.transformer.layer.3.attention.q_lin.m_in distilbert.transformer.layer.3.attention.q_lin.scale_out distilbert.transformer.layer.3.attention.q_lin.scale_in distilbert.transformer.layer.3.attention.q_lin.lora.A distilbert.transformer.layer.3.attention.q_lin.lora.B distilbert.transformer.layer.3.attention.k_lin.m_out distilbert.transformer.layer.3.attention.k_lin.m_in distilbert.transformer.layer.3.attention.k_lin.scale_out distilbert.transformer.layer.3.attention.k_lin.scale_in distilbert.transformer.layer.3.attention.k_lin.lora.A distilbert.transformer.layer.3.attention.k_lin.lora.B distilbert.transformer.layer.3.attention.v_lin.m_out distilbert.transformer.layer.3.attention.v_lin.m_in distilbert.transformer.layer.3.attention.v_lin.scale_out distilbert.transformer.layer.3.attention.v_lin.scale_in distilbert.transformer.layer.3.attention.v_lin.lora.A distilbert.transformer.layer.3.attention.v_lin.lora.B distilbert.transformer.layer.3.attention.out_lin.m_out distilbert.transformer.layer.3.attention.out_lin.m_in distilbert.transformer.layer.3.attention.out_lin.scale_out distilbert.transformer.layer.3.attention.out_lin.scale_in distilbert.transformer.layer.3.attention.out_lin.lora.A distilbert.transformer.layer.3.attention.out_lin.lora.B distilbert.transformer.layer.4.attention.q_lin.m_out distilbert.transformer.layer.4.attention.q_lin.m_in distilbert.transformer.layer.4.attention.q_lin.scale_out distilbert.transformer.layer.4.attention.q_lin.scale_in distilbert.transformer.layer.4.attention.q_lin.lora.A distilbert.transformer.layer.4.attention.q_lin.lora.B distilbert.transformer.layer.4.attention.k_lin.m_out distilbert.transformer.layer.4.attention.k_lin.m_in distilbert.transformer.layer.4.attention.k_lin.scale_out distilbert.transformer.layer.4.attention.k_lin.scale_in distilbert.transformer.layer.4.attention.k_lin.lora.A distilbert.transformer.layer.4.attention.k_lin.lora.B distilbert.transformer.layer.4.attention.v_lin.m_out distilbert.transformer.layer.4.attention.v_lin.m_in distilbert.transformer.layer.4.attention.v_lin.scale_out distilbert.transformer.layer.4.attention.v_lin.scale_in distilbert.transformer.layer.4.attention.v_lin.lora.A distilbert.transformer.layer.4.attention.v_lin.lora.B distilbert.transformer.layer.4.attention.out_lin.m_out distilbert.transformer.layer.4.attention.out_lin.m_in distilbert.transformer.layer.4.attention.out_lin.scale_out distilbert.transformer.layer.4.attention.out_lin.scale_in distilbert.transformer.layer.4.attention.out_lin.lora.A distilbert.transformer.layer.4.attention.out_lin.lora.B pre_classifier.weight pre_classifier.bias classifier.weight classifier.bias distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1 
distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1 
distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1 distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead warnings.warn(
[1564/1564 1:24:45, Epoch 2/2]
Step | Training Loss | Validation Loss | Accuracy | F1 |
---|---|---|---|---|
50 | 0.129500 | 0.192208 | 0.928000 | 0.928000 |
100 | 0.102500 | 0.195713 | 0.926400 | 0.926365 |
150 | 0.098800 | 0.205262 | 0.921600 | 0.921644 |
200 | 0.070700 | 0.209579 | 0.924000 | 0.923957 |
250 | 0.086800 | 0.218905 | 0.918400 | 0.918477 |
300 | 0.088500 | 0.215237 | 0.926400 | 0.926352 |
350 | 0.098500 | 0.214950 | 0.924000 | 0.924028 |
400 | 0.107200 | 0.215835 | 0.920800 | 0.920840 |
450 | 0.129700 | 0.212644 | 0.924000 | 0.924006 |
500 | 0.139400 | 0.205913 | 0.921600 | 0.921575 |
550 | 0.139600 | 0.202001 | 0.927200 | 0.927092 |
600 | 0.147400 | 0.199926 | 0.924800 | 0.924800 |
650 | 0.110000 | 0.202905 | 0.925600 | 0.925459 |
700 | 0.107900 | 0.204202 | 0.924000 | 0.923970 |
750 | 0.123700 | 0.205488 | 0.927200 | 0.927133 |
800 | 0.123000 | 0.207789 | 0.920000 | 0.920024 |
850 | 0.094700 | 0.208785 | 0.921600 | 0.921612 |
900 | 0.090300 | 0.210170 | 0.926400 | 0.926326 |
950 | 0.100800 | 0.212387 | 0.924800 | 0.924724 |
1000 | 0.088100 | 0.213624 | 0.926400 | 0.926365 |
1050 | 0.101400 | 0.213821 | 0.926400 | 0.926365 |
1100 | 0.109700 | 0.213743 | 0.925600 | 0.925545 |
1150 | 0.083400 | 0.214692 | 0.926400 | 0.926326 |
1200 | 0.105700 | 0.214382 | 0.925600 | 0.925545 |
1250 | 0.114500 | 0.213811 | 0.924800 | 0.924738 |
1300 | 0.119800 | 0.213729 | 0.926400 | 0.926326 |
1350 | 0.142000 | 0.212341 | 0.926400 | 0.926326 |
1400 | 0.137100 | 0.211481 | 0.926400 | 0.926326 |
1450 | 0.149900 | 0.210727 | 0.927200 | 0.927133 |
1500 | 0.162000 | 0.210390 | 0.927200 | 0.927133 |
1550 | 0.163200 | 0.210198 | 0.927200 | 0.927133 |
|===========================================================================| | PyTorch CUDA memory summary, device ID 0 | |---------------------------------------------------------------------------| | CUDA OOMs: 0 | cudaMalloc retries: 0 | |===========================================================================| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | |---------------------------------------------------------------------------| | Allocated memory | 552099 KiB | 12301 MiB | 354221 GiB | 354221 GiB | | from large pool | 541440 KiB | 12236 MiB | 352777 GiB | 352776 GiB | | from small pool | 10659 KiB | 67 MiB | 1444 GiB | 1444 GiB | |---------------------------------------------------------------------------| | Active memory | 552099 KiB | 12301 MiB | 354221 GiB | 354221 GiB | | from large pool | 541440 KiB | 12236 MiB | 352777 GiB | 352776 GiB | | from small pool | 10659 KiB | 67 MiB | 1444 GiB | 1444 GiB | |---------------------------------------------------------------------------| | Requested memory | 549886 KiB | 12297 MiB | 353679 GiB | 353679 GiB | | from large pool | 539228 KiB | 12231 MiB | 352242 GiB | 352241 GiB | | from small pool | 10658 KiB | 67 MiB | 1437 GiB | 1437 GiB | |---------------------------------------------------------------------------| | GPU reserved memory | 7954 MiB | 12590 MiB | 31890 MiB | 23936 MiB | | from large pool | 7904 MiB | 12520 MiB | 31716 MiB | 23812 MiB | | from small pool | 50 MiB | 70 MiB | 174 MiB | 124 MiB | |---------------------------------------------------------------------------| | Non-releasable memory | 76637 KiB | 319790 KiB | 65944 GiB | 65944 GiB | | from large pool | 68864 KiB | 314752 KiB | 64384 GiB | 64383 GiB | | from small pool | 7773 KiB | 43066 KiB | 1560 GiB | 1560 GiB | |---------------------------------------------------------------------------| | Allocations | 668 | 1318 | 33579 K | 33579 K | | from large pool | 80 | 298 | 9048 K | 9048 K | | from small pool | 588 | 1235 | 24531 K | 24530 K | |---------------------------------------------------------------------------| | Active allocs | 668 | 1318 | 33579 K | 33579 K | | from large pool | 80 | 298 | 9048 K | 9048 K | | from small pool | 588 | 1235 | 24531 K | 24530 K | |---------------------------------------------------------------------------| | GPU reserved segments | 154 | 271 | 643 | 489 | | from large pool | 129 | 236 | 556 | 427 | | from small pool | 25 | 35 | 87 | 62 | |---------------------------------------------------------------------------| | Non-releasable allocs | 33 | 90 | 14904 K | 14904 K | | from large pool | 18 | 24 | 1709 K | 1709 K | | from small pool | 15 | 73 | 13194 K | 13194 K | |---------------------------------------------------------------------------| | Oversize allocations | 0 | 0 | 0 | 0 | |---------------------------------------------------------------------------| | Oversize GPU segments | 0 | 0 | 0 | 0 | |===========================================================================| distilbert.transformer.layer.0.attention.q_lin: |A|=0.2769, |B|=0.1618, |∇A|=2.095e-05, |∇B|=1.97e-05, |LoRA(x)|=2.076e+04, B≠0=12288 distilbert.transformer.layer.0.attention.k_lin: |A|=0.2662, |B|=0.1606, |∇A|=8.706e-06, |∇B|=1.546e-05, |LoRA(x)|=2.29e+04, B≠0=12288 distilbert.transformer.layer.0.attention.v_lin: |A|=0.2628, |B|=0.1347, |∇A|=1.436e-05, |∇B|=2.381e-05, |LoRA(x)|=2.267e+04, B≠0=12288 distilbert.transformer.layer.0.attention.out_lin: |A|=0.2499, |B|=0.1315, |∇A|=2.324e-05, |∇B|=5.055e-05, |LoRA(x)|=1.135e+04, B≠0=12288 
distilbert.transformer.layer.0.ffn.lin1: |A|=0.2696, |B|=0.1625, |∇A|=0, |∇B|=0, |LoRA(x)|=5.201e+04, B≠0=49152 distilbert.transformer.layer.0.ffn.lin2: |A|=0.2408, |B|=0.131, |∇A|=0, |∇B|=0, |LoRA(x)|=1.624e+04, B≠0=12288 distilbert.transformer.layer.1.attention.q_lin: |A|=0.2634, |B|=0.1395, |∇A|=1.536e-05, |∇B|=1.773e-05, |LoRA(x)|=1.814e+04, B≠0=12288 distilbert.transformer.layer.1.attention.k_lin: |A|=0.2617, |B|=0.1574, |∇A|=1.364e-05, |∇B|=2.045e-05, |LoRA(x)|=1.879e+04, B≠0=12288 distilbert.transformer.layer.1.attention.v_lin: |A|=0.24, |B|=0.1254, |∇A|=1.798e-05, |∇B|=3.421e-05, |LoRA(x)|=1.623e+04, B≠0=12288 distilbert.transformer.layer.1.attention.out_lin: |A|=0.2471, |B|=0.1199, |∇A|=2.339e-05, |∇B|=5.574e-05, |LoRA(x)|=1.364e+04, B≠0=12288 distilbert.transformer.layer.1.ffn.lin1: |A|=0.2584, |B|=0.161, |∇A|=0, |∇B|=0, |LoRA(x)|=5.053e+04, B≠0=49152 distilbert.transformer.layer.1.ffn.lin2: |A|=0.262, |B|=0.1349, |∇A|=0, |∇B|=0, |LoRA(x)|=1.41e+04, B≠0=12288 distilbert.transformer.layer.2.attention.q_lin: |A|=0.2674, |B|=0.1574, |∇A|=1.91e-05, |∇B|=2.529e-05, |LoRA(x)|=2.3e+04, B≠0=12288 distilbert.transformer.layer.2.attention.k_lin: |A|=0.2654, |B|=0.1596, |∇A|=1.934e-05, |∇B|=2.576e-05, |LoRA(x)|=2.211e+04, B≠0=12288 distilbert.transformer.layer.2.attention.v_lin: |A|=0.239, |B|=0.1072, |∇A|=8.179e-06, |∇B|=2.949e-05, |LoRA(x)|=3.823e+04, B≠0=12288 distilbert.transformer.layer.2.attention.out_lin: |A|=0.2511, |B|=0.1348, |∇A|=4.259e-05, |∇B|=6.823e-05, |LoRA(x)|=1.03e+04, B≠0=12288 distilbert.transformer.layer.2.ffn.lin1: |A|=0.2741, |B|=0.1599, |∇A|=0, |∇B|=0, |LoRA(x)|=7.083e+04, B≠0=49152 distilbert.transformer.layer.2.ffn.lin2: |A|=0.2514, |B|=0.1236, |∇A|=0, |∇B|=0, |LoRA(x)|=1.92e+04, B≠0=12288 distilbert.transformer.layer.3.attention.q_lin: |A|=0.2613, |B|=0.1663, |∇A|=1.421e-05, |∇B|=2.463e-05, |LoRA(x)|=2.365e+04, B≠0=12288 distilbert.transformer.layer.3.attention.k_lin: |A|=0.2784, |B|=0.1723, |∇A|=2.856e-05, |∇B|=2.999e-05, |LoRA(x)|=2.423e+04, B≠0=12288 distilbert.transformer.layer.3.attention.v_lin: |A|=0.2536, |B|=0.1229, |∇A|=1.677e-05, |∇B|=2.682e-05, |LoRA(x)|=2.236e+04, B≠0=12288 distilbert.transformer.layer.3.attention.out_lin: |A|=0.2565, |B|=0.1265, |∇A|=5.192e-05, |∇B|=4.08e-05, |LoRA(x)|=1.278e+04, B≠0=12288 distilbert.transformer.layer.3.ffn.lin1: |A|=0.249, |B|=0.1445, |∇A|=0, |∇B|=0, |LoRA(x)|=4.909e+04, B≠0=49152 distilbert.transformer.layer.3.ffn.lin2: |A|=0.248, |B|=0.1179, |∇A|=0, |∇B|=0, |LoRA(x)|=2.538e+04, B≠0=12288 distilbert.transformer.layer.4.attention.q_lin: |A|=0.254, |B|=0.1624, |∇A|=1.096e-05, |∇B|=2.417e-05, |LoRA(x)|=3.029e+04, B≠0=12288 distilbert.transformer.layer.4.attention.k_lin: |A|=0.2586, |B|=0.155, |∇A|=2.714e-05, |∇B|=2.183e-05, |LoRA(x)|=2.354e+04, B≠0=12288 distilbert.transformer.layer.4.attention.v_lin: |A|=0.2423, |B|=0.1148, |∇A|=1.434e-05, |∇B|=1.919e-05, |LoRA(x)|=2.227e+04, B≠0=12288 distilbert.transformer.layer.4.attention.out_lin: |A|=0.2577, |B|=0.1189, |∇A|=3.38e-05, |∇B|=3.305e-05, |LoRA(x)|=1.403e+04, B≠0=12288 distilbert.transformer.layer.4.ffn.lin1: |A|=0.2543, |B|=0.1333, |∇A|=0, |∇B|=0, |LoRA(x)|=5.194e+04, B≠0=49151 distilbert.transformer.layer.4.ffn.lin2: |A|=0.2369, |B|=0.1056, |∇A|=0, |∇B|=0, |LoRA(x)|=7.262e+04, B≠0=12288 distilbert.transformer.layer.5.attention.q_lin: |A|=0.2567, |B|=0.157, |∇A|=0, |∇B|=0, |LoRA(x)|=4.137e+04, B≠0=12288 distilbert.transformer.layer.5.attention.k_lin: |A|=0.2547, |B|=0.1376, |∇A|=0, |∇B|=0, |LoRA(x)|=3.322e+04, B≠0=12288 distilbert.transformer.layer.5.attention.v_lin: 
|A|=0.2409, |B|=0.093, |∇A|=0, |∇B|=0, |LoRA(x)|=2.855e+04, B≠0=12288 distilbert.transformer.layer.5.attention.out_lin: |A|=0.2486, |B|=0.09723, |∇A|=0, |∇B|=0, |LoRA(x)|=1.485e+04, B≠0=12288 distilbert.transformer.layer.5.ffn.lin1: |A|=0.2278, |B|=0.1059, |∇A|=0, |∇B|=0, |LoRA(x)|=1.007e+05, B≠0=49152 distilbert.transformer.layer.5.ffn.lin2: |A|=0.2222, |B|=0.09406, |∇A|=0, |∇B|=0, |LoRA(x)|=1.188e+05, B≠0=12288
Training summary
- No obvious overfitting or catastrophic drift: validation loss is slightly noisy but flat, and validation accuracy/F1 hold steady at roughly 92.6–92.8% from step ~300 onward.
- |LoRA(x)| magnitudes are healthy, ranging from roughly 1e4 to 1e5. |∇A| and |∇B| are non-zero for the attention projections in layers 0–4, while the FFN adapters and all of layer 5 read zero in this particular snapshot. (A sketch of how these diagnostics can be collected follows below.)
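The per-layer diagnostics quoted above (|A|, |B|, |∇A|, |∇B|, and the count of non-zero B entries) can be reproduced with a short inspection pass over the adapted modules. The helper below is an illustrative sketch rather than the notebook's original logging code: it assumes each wrapped linear layer exposes a `lora` submodule with parameters `A` and `B` (matching parameter names such as `...q_lin.lora.A` printed above), and it must run after `loss.backward()` but before gradients are zeroed, otherwise every |∇A|/|∇B| will read zero. Tracking |LoRA(x)| would additionally require a forward hook that records the adapter output norm, which is omitted here for brevity.
import torch

@torch.no_grad()
def print_lora_diagnostics(model):
    # Walk every module and report statistics for those carrying a LoRA adapter.
    # Assumed attribute layout: module.lora.A and module.lora.B (hypothetical,
    # inferred from the parameter names shown in the output above).
    for name, module in model.named_modules():
        lora = getattr(module, "lora", None)
        if lora is None or not hasattr(lora, "A") or not hasattr(lora, "B"):
            continue
        A, B = lora.A, lora.B
        grad_a = A.grad.abs().mean().item() if A.grad is not None else 0.0
        grad_b = B.grad.abs().mean().item() if B.grad is not None else 0.0
        print(f"{name}: |A|={A.abs().mean().item():.4g}, |B|={B.abs().mean().item():.4g}, "
              f"|∇A|={grad_a:.4g}, |∇B|={grad_b:.4g}, B≠0={(B != 0).sum().item()}")

# Example usage (after a backward pass, before optimizer.zero_grad()):
# print_lora_diagnostics(model_ddora_all_attn)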
In [12]:
# Inspect the learned adapter parameters in three groups: the LoRA matrices ("lora"),
# the DDoRA scaling vectors ("lin.scale" -> scale_in/scale_out), and the
# DoRA/DDoRA magnitude vectors ("lin.m" -> m_in/m_out).
for key in ["lora", "lin.scale", "lin.m"]:
    print('Parameter Statistics: mean.abs()')
    for name, param in model_ddora_all_attn.named_parameters():
        if key in name:
            print(name, param.abs().mean().item())
    print('Parameter Statistics: param.norm()')
    for name, param in model_ddora_all_attn.named_parameters():
        if key in name:
            print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.lora.A 0.2749776840209961 distilbert.transformer.layer.0.attention.q_lin.lora.B 0.16095973551273346 distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2651287913322449 distilbert.transformer.layer.0.attention.k_lin.lora.B 0.1600244641304016 distilbert.transformer.layer.0.attention.v_lin.lora.A 0.2614787518978119 distilbert.transformer.layer.0.attention.v_lin.lora.B 0.1331043690443039 distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2485843151807785 distilbert.transformer.layer.0.attention.out_lin.lora.B 0.13032867014408112 distilbert.transformer.layer.0.ffn.lin1.lora.A 0.2695589065551758 distilbert.transformer.layer.0.ffn.lin1.lora.B 0.1625066101551056 distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2408287227153778 distilbert.transformer.layer.0.ffn.lin2.lora.B 0.13101235032081604 distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2617260217666626 distilbert.transformer.layer.1.attention.q_lin.lora.B 0.1386014223098755 distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2603480815887451 distilbert.transformer.layer.1.attention.k_lin.lora.B 0.15657925605773926 distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2385316789150238 distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12427060306072235 distilbert.transformer.layer.1.attention.out_lin.lora.A 0.24628107249736786 distilbert.transformer.layer.1.attention.out_lin.lora.B 0.11874490976333618 distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2583957612514496 distilbert.transformer.layer.1.ffn.lin1.lora.B 0.16097836196422577 distilbert.transformer.layer.1.ffn.lin2.lora.A 0.26195210218429565 distilbert.transformer.layer.1.ffn.lin2.lora.B 0.13490238785743713 distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2662392556667328 distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15666478872299194 distilbert.transformer.layer.2.attention.k_lin.lora.A 0.2635606527328491 distilbert.transformer.layer.2.attention.k_lin.lora.B 0.15871921181678772 distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23818302154541016 distilbert.transformer.layer.2.attention.v_lin.lora.B 0.10619251430034637 distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24991869926452637 distilbert.transformer.layer.2.attention.out_lin.lora.B 0.1336245983839035 distilbert.transformer.layer.2.ffn.lin1.lora.A 0.2741438150405884 distilbert.transformer.layer.2.ffn.lin1.lora.B 0.1599275767803192 distilbert.transformer.layer.2.ffn.lin2.lora.A 0.251351535320282 distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12355349957942963 distilbert.transformer.layer.3.attention.q_lin.lora.A 0.25989094376564026 distilbert.transformer.layer.3.attention.q_lin.lora.B 0.1655236780643463 distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2773197889328003 distilbert.transformer.layer.3.attention.k_lin.lora.B 0.1714954972267151 distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2522783577442169 distilbert.transformer.layer.3.attention.v_lin.lora.B 0.12189989537000656 distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2552739977836609 distilbert.transformer.layer.3.attention.out_lin.lora.B 0.1252676546573639 distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24903497099876404 distilbert.transformer.layer.3.ffn.lin1.lora.B 0.14453813433647156 distilbert.transformer.layer.3.ffn.lin2.lora.A 0.24802027642726898 distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11787533760070801 distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2526055574417114 
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.16136936843395233 distilbert.transformer.layer.4.attention.k_lin.lora.A 0.25712794065475464 distilbert.transformer.layer.4.attention.k_lin.lora.B 0.1542537808418274 distilbert.transformer.layer.4.attention.v_lin.lora.A 0.24055878818035126 distilbert.transformer.layer.4.attention.v_lin.lora.B 0.11367492377758026 distilbert.transformer.layer.4.attention.out_lin.lora.A 0.25640028715133667 distilbert.transformer.layer.4.attention.out_lin.lora.B 0.11762168258428574 distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2542564272880554 distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13327965140342712 distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2369489073753357 distilbert.transformer.layer.4.ffn.lin2.lora.B 0.10564354062080383 distilbert.transformer.layer.5.attention.q_lin.lora.A 0.25673991441726685 distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1569884866476059 distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2547406554222107 distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1376175880432129 distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24085545539855957 distilbert.transformer.layer.5.attention.v_lin.lora.B 0.09300228208303452 distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24862870573997498 distilbert.transformer.layer.5.attention.out_lin.lora.B 0.09722810983657837 distilbert.transformer.layer.5.ffn.lin1.lora.A 0.22779500484466553 distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1058943122625351 distilbert.transformer.layer.5.ffn.lin2.lora.A 0.22217977046966553 distilbert.transformer.layer.5.ffn.lin2.lora.B 0.09405812621116638 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 38.7124 distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 22.6398 distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 37.3590 distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 22.5018 distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 36.7891 distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 18.9860 distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 35.1135 distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 18.5258 distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 38.2489 distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 45.9634 distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 68.6745 distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 18.7223 distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 36.9099 distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 19.7729 distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 36.7043 distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 22.0986 distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 33.4483 distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 17.5916 distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 34.7149 distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 16.7974 distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 36.3546 distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 45.4047 distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 74.4587 distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 19.2996 distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 37.4187 
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 21.8922 distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 36.9623 distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 22.2086 distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 33.3070 distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 14.8621 distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 35.3756 distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 18.9542 distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 38.8654 distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 45.5287 distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 71.2674 distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 17.6604 distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 36.5975 distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 23.2939 distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 39.3017 distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 24.1747 distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 35.6658 distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 17.2920 distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 36.1697 distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 17.6819 distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 35.1100 distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 40.8434 distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 70.7395 distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 16.8413 distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 35.6475 distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 22.5161 distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 36.2313 distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 21.6272 distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.1842 distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 16.1350 distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 36.4880 distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 16.7542 distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 35.7436 distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 37.9658 distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 66.8764 distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 15.3375 distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 36.0503 distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 22.1811 distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 36.0766 distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 19.7476 distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.1035 distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 13.0682 distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 35.0975 distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 13.7873 distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 31.9677 distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 30.2731 distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 62.3751 distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 13.9422 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.scale_out 2.0837180614471436 
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9779415130615234 distilbert.transformer.layer.0.attention.k_lin.scale_out 2.062652111053467 distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9639686346054077 distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9488314390182495 distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9646772146224976 distilbert.transformer.layer.0.attention.out_lin.scale_out 1.909591555595398 distilbert.transformer.layer.0.attention.out_lin.scale_in 1.948154330253601 distilbert.transformer.layer.1.attention.q_lin.scale_out 2.0283894538879395 distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9835941791534424 distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9929611682891846 distilbert.transformer.layer.1.attention.k_lin.scale_in 1.965177297592163 distilbert.transformer.layer.1.attention.v_lin.scale_out 1.931127905845642 distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9647105932235718 distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9083741903305054 distilbert.transformer.layer.1.attention.out_lin.scale_in 1.95680832862854 distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0393614768981934 distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9570300579071045 distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0406978130340576 distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9585788249969482 distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9163254499435425 distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9447929859161377 distilbert.transformer.layer.2.attention.out_lin.scale_out 1.9148540496826172 distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9425309896469116 distilbert.transformer.layer.3.attention.q_lin.scale_out 2.020709991455078 distilbert.transformer.layer.3.attention.q_lin.scale_in 1.950387716293335 distilbert.transformer.layer.3.attention.k_lin.scale_out 2.052197217941284 distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9654889106750488 distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9081611633300781 distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9228265285491943 distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9347209930419922 distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9294712543487549 distilbert.transformer.layer.4.attention.q_lin.scale_out 2.056077480316162 distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9218626022338867 distilbert.transformer.layer.4.attention.k_lin.scale_out 2.069587230682373 distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9484262466430664 distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8914620876312256 distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9186089038848877 distilbert.transformer.layer.4.attention.out_lin.scale_out 1.923946738243103 distilbert.transformer.layer.4.attention.out_lin.scale_in 1.9129976034164429 distilbert.transformer.layer.5.attention.q_lin.scale_out 1.989418387413025 distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9430391788482666 distilbert.transformer.layer.5.attention.k_lin.scale_out 2.071225643157959 distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9252580404281616 distilbert.transformer.layer.5.attention.v_lin.scale_out 1.8846511840820312 distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9164375066757202 distilbert.transformer.layer.5.attention.out_lin.scale_out 1.880750060081482 distilbert.transformer.layer.5.attention.out_lin.scale_in 
1.9473273754119873 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.4084 distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.2926 distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.7943 distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8718 distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.4596 distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.8923 distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.3308 distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 54.3720 distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.6834 distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.3731 distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.7892 distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 54.8925 distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.8451 distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 54.7039 distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2424 distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.5817 distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 57.0762 distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.6677 distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.1677 distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 54.6647 distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.4624 distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 54.1592 distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.5978 distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 54.2466 distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.6097 distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4790 distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 57.5021 distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.9606 distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 53.3222 distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.7177 distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 54.1458 distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.9458 distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.6845 distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7307 distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.9948 distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.3578 distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 53.1103 distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 53.5967 distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 53.8903 distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 53.5606 distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.7921 distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 54.2675 distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.9446 distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.7487 distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 
53.0761 distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.5358 distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 52.6063 distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.3667 Parameter Statistics: mean.abs() distilbert.transformer.layer.0.attention.q_lin.m_out 0.37365245819091797 distilbert.transformer.layer.0.attention.q_lin.m_in 0.2769436240196228 distilbert.transformer.layer.0.attention.k_lin.m_out 0.36940622329711914 distilbert.transformer.layer.0.attention.k_lin.m_in 0.2654905617237091 distilbert.transformer.layer.0.attention.v_lin.m_out 0.2665351629257202 distilbert.transformer.layer.0.attention.v_lin.m_in 0.25776490569114685 distilbert.transformer.layer.0.attention.out_lin.m_out 0.23288491368293762 distilbert.transformer.layer.0.attention.out_lin.m_in 0.23689687252044678 distilbert.transformer.layer.1.attention.q_lin.m_out 0.31158336997032166 distilbert.transformer.layer.1.attention.q_lin.m_in 0.2566670775413513 distilbert.transformer.layer.1.attention.k_lin.m_out 0.3114917278289795 distilbert.transformer.layer.1.attention.k_lin.m_in 0.25877076387405396 distilbert.transformer.layer.1.attention.v_lin.m_out 0.23386649787425995 distilbert.transformer.layer.1.attention.v_lin.m_in 0.24185800552368164 distilbert.transformer.layer.1.attention.out_lin.m_out 0.2300492525100708 distilbert.transformer.layer.1.attention.out_lin.m_in 0.23636046051979065 distilbert.transformer.layer.2.attention.q_lin.m_out 0.34007567167282104 distilbert.transformer.layer.2.attention.q_lin.m_in 0.2554404139518738 distilbert.transformer.layer.2.attention.k_lin.m_out 0.33773207664489746 distilbert.transformer.layer.2.attention.k_lin.m_in 0.2607421278953552 distilbert.transformer.layer.2.attention.v_lin.m_out 0.23071661591529846 distilbert.transformer.layer.2.attention.v_lin.m_in 0.2291429042816162 distilbert.transformer.layer.2.attention.out_lin.m_out 0.2561138868331909 distilbert.transformer.layer.2.attention.out_lin.m_in 0.23889867961406708 distilbert.transformer.layer.3.attention.q_lin.m_out 0.32864949107170105 distilbert.transformer.layer.3.attention.q_lin.m_in 0.2578366696834564 distilbert.transformer.layer.3.attention.k_lin.m_out 0.35121220350265503 distilbert.transformer.layer.3.attention.k_lin.m_in 0.2772657871246338 distilbert.transformer.layer.3.attention.v_lin.m_out 0.23328649997711182 distilbert.transformer.layer.3.attention.v_lin.m_in 0.24039456248283386 distilbert.transformer.layer.3.attention.out_lin.m_out 0.25510165095329285 distilbert.transformer.layer.3.attention.out_lin.m_in 0.2509150505065918 distilbert.transformer.layer.4.attention.q_lin.m_out 0.3528974950313568 distilbert.transformer.layer.4.attention.q_lin.m_in 0.23650650680065155 distilbert.transformer.layer.4.attention.k_lin.m_out 0.3678012490272522 distilbert.transformer.layer.4.attention.k_lin.m_in 0.24499206244945526 distilbert.transformer.layer.4.attention.v_lin.m_out 0.24949286878108978 distilbert.transformer.layer.4.attention.v_lin.m_in 0.2326044738292694 distilbert.transformer.layer.4.attention.out_lin.m_out 0.2516971230506897 distilbert.transformer.layer.4.attention.out_lin.m_in 0.22889690101146698 distilbert.transformer.layer.5.attention.q_lin.m_out 0.3036739230155945 distilbert.transformer.layer.5.attention.q_lin.m_in 0.251531720161438 distilbert.transformer.layer.5.attention.k_lin.m_out 0.3580213785171509 distilbert.transformer.layer.5.attention.k_lin.m_in 0.23563992977142334 distilbert.transformer.layer.5.attention.v_lin.m_out 0.25412318110466003 
distilbert.transformer.layer.5.attention.v_lin.m_in 0.2263183444738388 distilbert.transformer.layer.5.attention.out_lin.m_out 0.20213589072227478 distilbert.transformer.layer.5.attention.out_lin.m_in 0.23632609844207764 Parameter Statistics: param.norm() distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 12.5664 distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 9.8370 distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 12.4071 distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 9.4924 distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 9.3731 distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 9.3112 distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 8.2969 distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 8.8274 distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 10.7503 distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 9.1579 distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 10.8811 distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.3415 distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 8.3309 distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 8.7695 distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.3219 distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 8.7636 distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.6643 distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.1391 distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 11.6859 distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 9.3386 distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.4619 distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 8.3938 distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.1581 distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 8.8639 distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.3407 distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.3832 distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 12.0453 distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 9.8448 distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.5036 distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.1459 distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.1446 distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.5005 distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.2625 distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 8.7035 distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 12.5059 distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 8.8216 distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 9.0152 distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.0068 distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 8.9888 distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.0314 distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 10.8569 distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.2505 distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.0845 distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.9086 distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.4463 
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 8.8336 distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 7.7024 distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.9938
In [ ]: