In [1]:
import torch
import sys
import gc
print(sys.version)
print(f"PyTorch Version: {torch.__version__}")
print(torch.cuda.is_available())
print(torch.cuda.device_count())

if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(torch.cuda.get_device_name(0))

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

import bitsandbytes
import peft
import transformers

print(transformers.__version__)

print(f"bitsandbytes version: {bitsandbytes.__version__}")
print(f"peft version: {peft.__version__}")
print(torch.cuda.is_bf16_supported())

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]
PyTorch Version: 2.5.1+cu121
True
1
CUDA Version: 12.1
NVIDIA GeForce RTX 4080 Laptop GPU
4.50.0.dev0
bitsandbytes version: 0.45.3
peft version: 0.15.2.dev0
True
In [2]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset = imdb_dataset.rename_column("label", "labels")
# Split the test set into validation and test sets
test_val_split = imdb_dataset['test'].train_test_split(test_size=0.95, seed=42)
imdb_dataset['validation'] = test_val_split['train']
imdb_dataset['test'] = test_val_split['test']

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# Determine the number of labels
num_labels = len(set(imdb_dataset["train"]["labels"]))
print(f"Number of labels: {num_labels}")

# Load the tokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize the whole dataset, truncate to 384 tokens
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=384)

dataset_encoded = imdb_dataset.map(tokenize, batched=True, batch_size=None)

# Load the pretrained model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))
#print(model)
Number of labels: 2
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [3]:
# Helper functions
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
    
def count_trainable_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params, 100 * trainable_params / total_params

def freeze_model_layers(model, unfreeze_pre_classifier=False):
    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze LoRA and DoRA-specific params, including lora_norm
    for name, param in model.named_parameters():
        if (
            "lora.A" in name
            or "lora.B" in name
            or "lora_norm" in name  
            or name.endswith(".m")   # For DoRA
            or name.endswith(".m_in") # For DDoRA
            or name.endswith(".m_out") # For DDoRA
            or "scale" in name
        ):
            param.requires_grad = True

    # Unfreeze classifier layer (always)
    for name, param in model.named_parameters():
        if name.startswith("classifier."):
            param.requires_grad = True

    # unfreeze pre-classifier
    if unfreeze_pre_classifier:
        for name, param in model.named_parameters():
            if name.startswith("pre_classifier."):
                param.requires_grad = True

Double DoRA (DDoRA)

Double Weight-Decomposed Low-Rank Adaptation

In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.autograd.set_detect_anomaly(True)

class LoRALayer(nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha, dropout_rate=0.0):
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        self.B = nn.Parameter(1e-4 * torch.randn(rank, out_dim) * std_dev)  # not all zeroes!
        self.alpha = alpha
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        dropped = self.dropout(x @ self.A)
        return self.alpha * (dropped @ self.B)


class LinearWithDoubleDoRA(nn.Module):
    def __init__(self, linear, rank, alpha, scaling_factor=1.0):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        # Per-output and per-input magnitude vectors: the "double" in Double DoRA
        self.m_out = nn.Parameter(torch.randn(1, linear.out_features) * std_dev)
        self.m_in = nn.Parameter(torch.randn(linear.in_features, 1) * std_dev)
        # Orthogonal initialization for m_out
        #self.m_out = nn.Parameter(torch.empty(1, linear.out_features))
        #nn.init.orthogonal_(self.m_out)
        # Orthogonal initialization for m_in
        #self.m_in = nn.Parameter(torch.empty(linear.in_features, 1))
        #nn.init.orthogonal_(self.m_in)        
        self.scale_out = nn.Parameter(torch.full((1, linear.out_features), float(scaling_factor)))
        self.scale_in = nn.Parameter(torch.full((linear.in_features, 1), float(scaling_factor)))
        self.last_lora_output_norm = 0.0  # For monitoring

    def forward(self, x):
        # Gate the input features with the per-input magnitude (m_in) and scale (scale_in)
        scaled_x = x * self.scale_in.T * self.m_in.T
        linear_output = self.linear(x)
        lora_output = self.lora(scaled_x)
        # DoRA-style decomposition: keep only the direction of the LoRA output ...
        lora_output_norm = lora_output / (lora_output.norm(p=2, dim=1, keepdim=True) + 1e-9)
        self.last_lora_output_norm = lora_output.norm(p=2, dim=-1).mean().item()
        # ... and re-scale it with the per-output magnitude (m_out) and scale (scale_out)
        dora_modification = self.scale_out * self.m_out * lora_output_norm
        return linear_output + dora_modification



def inject_ddora_all_attn(model, rank, alpha, scaling_factor=1.0, dropout_rate=0.0, disable_layers=None):
    target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin", "ffn.lin1", "ffn.lin2"]
    #target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin"]
    if disable_layers is None:
        disable_layers = []

    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and any(layer in name for layer in target_layers):
            # Try to extract layer index from names like "transformer.layer.4.attention.q_lin"
            parts = name.split('.')
            layer_idx = None
            for i, part in enumerate(parts):
                if part == "layer" and i + 1 < len(parts):
                    try:
                        layer_idx = int(parts[i + 1])
                        break
                    except ValueError:
                        pass

            if layer_idx is not None and layer_idx in disable_layers:
                continue

            parent_name = name.rsplit('.', 1)[0]
            parent_module = model.get_submodule(parent_name)
            original_linear = getattr(parent_module, name.split('.')[-1])

            ddora_layer = LinearWithDoubleDoRA(original_linear, rank, alpha, scaling_factor)
            ddora_layer.lora.dropout = nn.Dropout(dropout_rate)

            setattr(parent_module, name.split('.')[-1], ddora_layer)

    return model
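
DDoRA extends DoRA's directional scaling to both sides of each adapted linear layer: the input is gated by m_in and scale_in before the low-rank path, and the normalised LoRA output is gated by m_out and scale_out before being added to the base linear output. As a quick sanity check, a minimal sketch with an assumed stand-alone 768-wide nn.Linear (not taken from the notebook's training flow):

# Sketch: wrap a dummy linear layer with LinearWithDoubleDoRA and check shapes
probe_linear = nn.Linear(768, 768)
probe_ddora = LinearWithDoubleDoRA(probe_linear, rank=16, alpha=128, scaling_factor=2.0)
x = torch.randn(4, 768)          # (batch, in_features)
print(probe_ddora(x).shape)      # torch.Size([4, 768])
# Adapter overhead: lora.A, lora.B, m_in, m_out, scale_in, scale_out (27,648 params at this width)
print(sum(p.numel() for n, p in probe_ddora.named_parameters() if not n.startswith("linear.")))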
In [5]:
def monitor_lora_parameters(model, threshold=1e-7):
    monitor = {
        "A_abs_mean": [],
        "B_abs_mean": [],
        "A_grad_mean": [],
        "B_grad_mean": [],
        "lora_output_norm": [],
        "B_nonzero_count": [],
    }
    hooks = []

    for name, module in model.named_modules():
        if hasattr(module, "lora") and hasattr(module.lora, "A") and hasattr(module.lora, "B"):
            A_param = module.lora.A
            B_param = module.lora.B

            # Gradient hooks (directly on nn.Parameter)
            if A_param.requires_grad:
                hooks.append(A_param.register_hook(lambda grad, n=name: monitor["A_grad_mean"].append((n, grad.abs().mean().item()))))
            if B_param.requires_grad:
                hooks.append(B_param.register_hook(lambda grad, n=name: monitor["B_grad_mean"].append((n, grad.abs().mean().item()))))

            # Forward hook for value stats
            def forward_hook(mod, inp, out, n=name):
                A_mean = mod.lora.A.abs().mean().item()
                B_mean = mod.lora.B.abs().mean().item()
                B_nnz = (mod.lora.B.abs() > threshold).sum().item()
                monitor["A_abs_mean"].append((n, A_mean))
                monitor["B_abs_mean"].append((n, B_mean))
                monitor["B_nonzero_count"].append((n, B_nnz))
                monitor["lora_output_norm"].append((n, mod.last_lora_output_norm))

            hooks.append(module.register_forward_hook(forward_hook))

    return hooks, monitor

from transformers import TrainingArguments

def monitor_gradients(model):
    hooks = []
    gradient_history = {}

    for name, param in model.named_parameters():
        if param.requires_grad:
            gradient_history[name] = []

            def get_hook(n):  # capture the name immediately
                def hook(grad):
                    gradient_history[n].append(grad.abs().mean().item())
                return hook

            hooks.append(param.register_hook(get_hook(name)))
    return hooks, gradient_history
  1. This study investigates the stability of LoRA-based adaptor training on the IMDb dataset. It draws on pure LoRA training on IMDb (https://lzrdgreen.github.io/LLMs/LoRAonIMDB.html) and the lessons learned there. With zero dropout, ∣B∣ is nearly 3 orders of magnitude smaller than ∣A∣. Even though ∣∇B∣ is much larger than ∣∇A∣, learning is slow and inefficient because the weight update is determined by the product BA: since B has small magnitude it acts as a weak projector, so despite its large gradients B does not learn enough and most of the burden falls on A (see the sketch after this list). You can also see all 3 adaptors (LoRA, DoRA, and DDoRA, a natural extension of the DoRA idea of directional scaling to both the input and output sides) trained on IMDb with zero dropout here: https://lzrdgreen.github.io/LLMs/adapters.html
  2. As we saw in https://lzrdgreen.github.io/LLMs/LoRAonIMDB.html, increasing the dropout (applied after the projection with matrix A and before the final projection with matrix B) changes the training dynamics significantly: with dropout = 40%, B is forced to adapt more robustly over time.
  3. It is easy to check that a further increase in dropout leads to even more adaptation of matrix B; however, since LoRA's effective update is the product BA, too much noise in the A projection eventually destroys training.
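
A minimal sketch of the scale argument in item 1, assuming the LoRALayer class defined above and an illustrative hidden size of 768: at initialisation (and, with zero dropout, B stays in this regime), the effective update is tiny because ∣B∣ is orders of magnitude smaller than ∣A∣, no matter how large ∣∇B∣ is.

# Sketch: a small-magnitude B throttles the effective LoRA update
torch.manual_seed(0)
probe = LoRALayer(in_dim=768, out_dim=768, rank=16, alpha=1.0)
with torch.no_grad():
    delta_w = probe.A @ probe.B   # effective update matrix in this layout (applied as x @ A @ B)
    print(f"|A|  = {probe.A.abs().mean().item():.3e}")
    print(f"|B|  = {probe.B.abs().mean().item():.3e}")
    print(f"|AB| = {delta_w.abs().mean().item():.3e}  # tiny: B is the bottleneck")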
In [6]:
learning_rate = 1e-2 #############
dropout = 0.3 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
output_dir_prefix = "finetuned-imdb-"

import copy
torch.manual_seed(137)

model_ddora_all_attn = copy.deepcopy(model)
model_ddora_all_attn = inject_ddora_all_attn(model_ddora_all_attn, lora_rank, lora_alpha, scaling_factor, dropout)
freeze_model_layers(model_ddora_all_attn, unfreeze_pre_classifier=True)

total_params_ddora, trainable_params_ddora, percentage_ddora = count_trainable_parameters(model_ddora_all_attn)
print(f"\nDDoRA (All Attention) - Total parameters: {total_params_ddora:,}")
print(f"DDoRA (All Attention) - Trainable parameters: {trainable_params_ddora:,} ({percentage_ddora:.2f}%)")

# Sanity check
#print("\nTrainable parameters after freezing:")
#for name, param in model_ddora_all_attn.named_parameters():
#    if param.requires_grad:
#        print(name)

from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"

training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=2, 
    #max_steps=200,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0, #####
    report_to="none",
    log_level="error"
)

    
trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)


hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)

#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())

#for hook in hooks2:
#    hook.remove()

#for name, grads in gradient_history2.items():
#    print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")

for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}

# Example output
for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")

#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
DDoRA (All Attention) - Total parameters: 68,448,002
DDoRA (All Attention) - Trainable parameters: 2,085,122 (3.05%)
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
[1564/1564 2:06:50, Epoch 2/2]
Step Training Loss Validation Loss Accuracy F1
50 0.583200 0.284775 0.880800 0.880623
100 0.331500 0.318036 0.865600 0.865880
150 0.334500 0.290622 0.888800 0.888612
200 0.275800 0.277798 0.884800 0.885054
250 0.289800 0.259306 0.896800 0.896963
300 0.272700 0.256442 0.902400 0.901592
350 0.289100 0.249133 0.907200 0.906665
400 0.252400 0.246030 0.907200 0.907287
450 0.255500 0.275144 0.906400 0.905762
500 0.268000 0.227364 0.906400 0.906347
550 0.258200 0.225571 0.913600 0.913558
600 0.273100 0.239456 0.916000 0.916031
650 0.231700 0.223158 0.915200 0.915013
700 0.230700 0.221400 0.915200 0.915031
750 0.231000 0.268203 0.913600 0.912997
800 0.238900 0.264123 0.904000 0.903140
850 0.214300 0.211674 0.917600 0.917539
900 0.195900 0.220456 0.920000 0.919857
950 0.215500 0.249347 0.916000 0.915967
1000 0.208900 0.222482 0.918400 0.918332
1050 0.189600 0.200232 0.919200 0.919229
1100 0.208800 0.206135 0.911200 0.911267
1150 0.178700 0.208787 0.919200 0.919240
1200 0.187000 0.220926 0.918400 0.918518
1250 0.194600 0.200717 0.922400 0.922343
1300 0.209100 0.199653 0.921600 0.921575
1350 0.207200 0.199614 0.916800 0.916836
1400 0.193000 0.193851 0.924800 0.924822
1450 0.175600 0.203755 0.924000 0.923930
1500 0.183900 0.198105 0.924000 0.923930
1550 0.191100 0.196167 0.924000 0.923970

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 564070 KiB |  12301 MiB | 121040 GiB | 121039 GiB |
|       from large pool | 546048 KiB |  12236 MiB | 120553 GiB | 120552 GiB |
|       from small pool |  18022 KiB |     67 MiB |    487 GiB |    487 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 564070 KiB |  12301 MiB | 121040 GiB | 121039 GiB |
|       from large pool | 546048 KiB |  12236 MiB | 120553 GiB | 120552 GiB |
|       from small pool |  18022 KiB |     67 MiB |    487 GiB |    487 GiB |
|---------------------------------------------------------------------------|
| Requested memory      | 561856 KiB |  12297 MiB | 120831 GiB | 120831 GiB |
|       from large pool | 543836 KiB |  12231 MiB | 120346 GiB | 120346 GiB |
|       from small pool |  18020 KiB |     67 MiB |    485 GiB |    485 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  12590 MiB |  12590 MiB |  12590 MiB |      0 B   |
|       from large pool |  12520 MiB |  12520 MiB |  12520 MiB |      0 B   |
|       from small pool |     70 MiB |     70 MiB |     70 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Non-releasable memory |  72858 KiB | 144757 KiB |  10543 GiB |  10543 GiB |
|       from large pool |  64256 KiB | 137344 KiB |  10019 GiB |  10019 GiB |
|       from small pool |   8602 KiB |  43066 KiB |    523 GiB |    523 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     868    |    1318    |   11614 K  |   11613 K  |
|       from large pool |      82    |     298    |    3067 K  |    3067 K  |
|       from small pool |     786    |    1235    |    8547 K  |    8546 K  |
|---------------------------------------------------------------------------|
| Active allocs         |     868    |    1318    |   11614 K  |   11613 K  |
|       from large pool |      82    |     298    |    3067 K  |    3067 K  |
|       from small pool |     786    |    1235    |    8547 K  |    8546 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     271    |     271    |     271    |       0    |
|       from large pool |     236    |     236    |     236    |       0    |
|       from small pool |      35    |      35    |      35    |       0    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      41    |      90    |    4965 K  |    4965 K  |
|       from large pool |      18    |      23    |     444 K  |     444 K  |
|       from small pool |      23    |      73    |    4520 K  |    4520 K  |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

distilbert.transformer.layer.0.attention.q_lin: |A|=0.2544, |B|=0.1359, |∇A|=1.05e-05, |∇B|=1.63e-05, |LoRA(x)|=2.025e+04, B≠0=12288
distilbert.transformer.layer.0.attention.k_lin: |A|=0.2467, |B|=0.1347, |∇A|=5.453e-06, |∇B|=1.36e-05, |LoRA(x)|=2.095e+04, B≠0=12288
distilbert.transformer.layer.0.attention.v_lin: |A|=0.2444, |B|=0.1122, |∇A|=6.981e-06, |∇B|=2.181e-05, |LoRA(x)|=2.793e+04, B≠0=12288
distilbert.transformer.layer.0.attention.out_lin: |A|=0.2346, |B|=0.1088, |∇A|=1.251e-05, |∇B|=4.782e-05, |LoRA(x)|=1.586e+04, B≠0=12288
distilbert.transformer.layer.0.ffn.lin1: |A|=0.2488, |B|=0.1355, |∇A|=2.653e-05, |∇B|=1.821e-05, |LoRA(x)|=4.497e+04, B≠0=49152
distilbert.transformer.layer.0.ffn.lin2: |A|=0.2276, |B|=0.1085, |∇A|=6.454e-06, |∇B|=4.911e-05, |LoRA(x)|=2.049e+04, B≠0=12288
distilbert.transformer.layer.1.attention.q_lin: |A|=0.2437, |B|=0.117, |∇A|=8.031e-06, |∇B|=1.606e-05, |LoRA(x)|=2.241e+04, B≠0=12288
distilbert.transformer.layer.1.attention.k_lin: |A|=0.2425, |B|=0.132, |∇A|=8.43e-06, |∇B|=1.863e-05, |LoRA(x)|=1.773e+04, B≠0=12288
distilbert.transformer.layer.1.attention.v_lin: |A|=0.2266, |B|=0.107, |∇A|=1.311e-05, |∇B|=4.143e-05, |LoRA(x)|=1.756e+04, B≠0=12288
distilbert.transformer.layer.1.attention.out_lin: |A|=0.2345, |B|=0.1007, |∇A|=1.681e-05, |∇B|=6.981e-05, |LoRA(x)|=1.455e+04, B≠0=12288
distilbert.transformer.layer.1.ffn.lin1: |A|=0.2421, |B|=0.1354, |∇A|=2.222e-05, |∇B|=2.405e-05, |LoRA(x)|=4.663e+04, B≠0=49152
distilbert.transformer.layer.1.ffn.lin2: |A|=0.243, |B|=0.1093, |∇A|=8.305e-06, |∇B|=6.969e-05, |LoRA(x)|=2.158e+04, B≠0=12288
distilbert.transformer.layer.2.attention.q_lin: |A|=0.2483, |B|=0.1337, |∇A|=1.414e-05, |∇B|=2.872e-05, |LoRA(x)|=2.192e+04, B≠0=12288
distilbert.transformer.layer.2.attention.k_lin: |A|=0.2465, |B|=0.1365, |∇A|=1.158e-05, |∇B|=2.203e-05, |LoRA(x)|=2.198e+04, B≠0=12288
distilbert.transformer.layer.2.attention.v_lin: |A|=0.2268, |B|=0.09255, |∇A|=6.758e-06, |∇B|=5.326e-05, |LoRA(x)|=3.84e+04, B≠0=12288
distilbert.transformer.layer.2.attention.out_lin: |A|=0.2352, |B|=0.1127, |∇A|=3.042e-05, |∇B|=8.393e-05, |LoRA(x)|=1.015e+04, B≠0=12288
distilbert.transformer.layer.2.ffn.lin1: |A|=0.2533, |B|=0.1324, |∇A|=1.371e-05, |∇B|=2.008e-05, |LoRA(x)|=9.044e+04, B≠0=49152
distilbert.transformer.layer.2.ffn.lin2: |A|=0.2372, |B|=0.1021, |∇A|=7.97e-06, |∇B|=7.575e-05, |LoRA(x)|=2.32e+04, B≠0=12288
distilbert.transformer.layer.3.attention.q_lin: |A|=0.2431, |B|=0.1419, |∇A|=1.03e-05, |∇B|=2.381e-05, |LoRA(x)|=2.253e+04, B≠0=12288
distilbert.transformer.layer.3.attention.k_lin: |A|=0.2587, |B|=0.146, |∇A|=1.849e-05, |∇B|=2.731e-05, |LoRA(x)|=2.278e+04, B≠0=12288
distilbert.transformer.layer.3.attention.v_lin: |A|=0.2391, |B|=0.1046, |∇A|=1.518e-05, |∇B|=4.536e-05, |LoRA(x)|=2.051e+04, B≠0=12288
distilbert.transformer.layer.3.attention.out_lin: |A|=0.2426, |B|=0.1088, |∇A|=4.642e-05, |∇B|=6.672e-05, |LoRA(x)|=1.067e+04, B≠0=12288
distilbert.transformer.layer.3.ffn.lin1: |A|=0.2369, |B|=0.1235, |∇A|=3.247e-05, |∇B|=3.287e-05, |LoRA(x)|=3.809e+04, B≠0=49152
distilbert.transformer.layer.3.ffn.lin2: |A|=0.2361, |B|=0.09859, |∇A|=8.055e-06, |∇B|=7.404e-05, |LoRA(x)|=2.655e+04, B≠0=12288
distilbert.transformer.layer.4.attention.q_lin: |A|=0.2393, |B|=0.1376, |∇A|=1.142e-05, |∇B|=3.362e-05, |LoRA(x)|=2.431e+04, B≠0=12288
distilbert.transformer.layer.4.attention.k_lin: |A|=0.2429, |B|=0.132, |∇A|=2.908e-05, |∇B|=2.764e-05, |LoRA(x)|=2.04e+04, B≠0=12288
distilbert.transformer.layer.4.attention.v_lin: |A|=0.2302, |B|=0.0967, |∇A|=9.844e-06, |∇B|=3.014e-05, |LoRA(x)|=2.18e+04, B≠0=12288
distilbert.transformer.layer.4.attention.out_lin: |A|=0.2426, |B|=0.09934, |∇A|=2.959e-05, |∇B|=7.185e-05, |LoRA(x)|=1.232e+04, B≠0=12288
distilbert.transformer.layer.4.ffn.lin1: |A|=0.2427, |B|=0.1143, |∇A|=1.822e-05, |∇B|=1.799e-05, |LoRA(x)|=4.323e+04, B≠0=49152
distilbert.transformer.layer.4.ffn.lin2: |A|=0.2287, |B|=0.09016, |∇A|=2.057e-06, |∇B|=5.98e-05, |LoRA(x)|=6.813e+04, B≠0=12288
distilbert.transformer.layer.5.attention.q_lin: |A|=0.2438, |B|=0.1335, |∇A|=8.323e-06, |∇B|=3.188e-05, |LoRA(x)|=3.349e+04, B≠0=12288
distilbert.transformer.layer.5.attention.k_lin: |A|=0.2443, |B|=0.1176, |∇A|=2.251e-05, |∇B|=1.555e-05, |LoRA(x)|=2.596e+04, B≠0=12288
distilbert.transformer.layer.5.attention.v_lin: |A|=0.2302, |B|=0.08092, |∇A|=5.421e-06, |∇B|=2.434e-05, |LoRA(x)|=2.488e+04, B≠0=12288
distilbert.transformer.layer.5.attention.out_lin: |A|=0.2377, |B|=0.08437, |∇A|=1.694e-05, |∇B|=7.318e-05, |LoRA(x)|=1.172e+04, B≠0=12288
distilbert.transformer.layer.5.ffn.lin1: |A|=0.2207, |B|=0.09195, |∇A|=4.348e-06, |∇B|=1.028e-05, |LoRA(x)|=6.974e+04, B≠0=49152
distilbert.transformer.layer.5.ffn.lin2: |A|=0.2169, |B|=0.08306, |∇A|=1.032e-06, |∇B|=5.664e-05, |LoRA(x)|=8.686e+04, B≠0=12288

Training summary

  1. With dropout = 0.3, dropout in the LoRA path forces A to be noisy/incomplete, pushing B to compensate. Now ‖B‖ is roughly 50–60% of ‖A‖ across the board (a quick per-layer check follows this list).
  2. ∇B is consistently larger than ∇A, meaning B is actively adapting, carrying part of the representational burden, and no longer a passive post-multiplier. This is critical: dropout lets B become a full partner in the learned LoRA basis, not a ghost. Dropout is essential for equalising the adaptation pressure on A and B.
  3. |LoRA(x)| magnitudes are high but reasonable and not explosive. Further growth of |LoRA(x)| could easily destabilise training.
  4. DDoRA adds m_in, m_out, and directional scale factors. These interact nonlinearly, which increases the risk of instability even further. Training is nonetheless stable so far: no explosive norms or vanishing gradients.
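
To verify the ‖B‖/‖A‖ claim directly on the trained model, the injected modules can be walked and the per-layer ratio printed. A minimal sketch, assuming the LinearWithDoubleDoRA modules created by inject_ddora_all_attn above:

# Sketch: per-layer ratio of mean |B| to mean |A| for every injected DDoRA module
for name, module in model_ddora_all_attn.named_modules():
    if isinstance(module, LinearWithDoubleDoRA):
        a_mean = module.lora.A.abs().mean().item()
        b_mean = module.lora.B.abs().mean().item()
        print(f"{name}: |B|/|A| = {b_mean / a_mean:.2f}")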
In [7]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.2702498137950897
distilbert.transformer.layer.0.attention.q_lin.lora.B 0.15954196453094482
distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2613638639450073
distilbert.transformer.layer.0.attention.k_lin.lora.B 0.15853147208690643
distilbert.transformer.layer.0.attention.v_lin.lora.A 0.25769156217575073
distilbert.transformer.layer.0.attention.v_lin.lora.B 0.130849689245224
distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2441953420639038
distilbert.transformer.layer.0.attention.out_lin.lora.B 0.12809059023857117
distilbert.transformer.layer.0.ffn.lin1.lora.A 0.2653345465660095
distilbert.transformer.layer.0.ffn.lin1.lora.B 0.16109466552734375
distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2382393330335617
distilbert.transformer.layer.0.ffn.lin2.lora.B 0.12931501865386963
distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2573601007461548
distilbert.transformer.layer.1.attention.q_lin.lora.B 0.13697725534439087
distilbert.transformer.layer.1.attention.k_lin.lora.A 0.25658535957336426
distilbert.transformer.layer.1.attention.k_lin.lora.B 0.1549973338842392
distilbert.transformer.layer.1.attention.v_lin.lora.A 0.23516318202018738
distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12259270995855331
distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2436894178390503
distilbert.transformer.layer.1.attention.out_lin.lora.B 0.11648242175579071
distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2548418641090393
distilbert.transformer.layer.1.ffn.lin1.lora.B 0.15947595238685608
distilbert.transformer.layer.1.ffn.lin2.lora.A 0.259360134601593
distilbert.transformer.layer.1.ffn.lin2.lora.B 0.13327041268348694
distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2628241777420044
distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15530811250209808
distilbert.transformer.layer.2.attention.k_lin.lora.A 0.25873541831970215
distilbert.transformer.layer.2.attention.k_lin.lora.B 0.15722517669200897
distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23438052833080292
distilbert.transformer.layer.2.attention.v_lin.lora.B 0.1039535403251648
distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24704134464263916
distilbert.transformer.layer.2.attention.out_lin.lora.B 0.13156524300575256
distilbert.transformer.layer.2.ffn.lin1.lora.A 0.2707127332687378
distilbert.transformer.layer.2.ffn.lin1.lora.B 0.1584470272064209
distilbert.transformer.layer.2.ffn.lin2.lora.A 0.24950599670410156
distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12168612331151962
distilbert.transformer.layer.3.attention.q_lin.lora.A 0.25530001521110535
distilbert.transformer.layer.3.attention.q_lin.lora.B 0.16401663422584534
distilbert.transformer.layer.3.attention.k_lin.lora.A 0.27414608001708984
distilbert.transformer.layer.3.attention.k_lin.lora.B 0.17030774056911469
distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2488800585269928
distilbert.transformer.layer.3.attention.v_lin.lora.B 0.11997505277395248
distilbert.transformer.layer.3.attention.out_lin.lora.A 0.25246256589889526
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.12312033027410507
distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24574077129364014
distilbert.transformer.layer.3.ffn.lin1.lora.B 0.1428002119064331
distilbert.transformer.layer.3.ffn.lin2.lora.A 0.2467063069343567
distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11558239161968231
distilbert.transformer.layer.4.attention.q_lin.lora.A 0.24940571188926697
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.15973596274852753
distilbert.transformer.layer.4.attention.k_lin.lora.A 0.25356122851371765
distilbert.transformer.layer.4.attention.k_lin.lora.B 0.15287962555885315
distilbert.transformer.layer.4.attention.v_lin.lora.A 0.23811078071594238
distilbert.transformer.layer.4.attention.v_lin.lora.B 0.11120724678039551
distilbert.transformer.layer.4.attention.out_lin.lora.A 0.25315380096435547
distilbert.transformer.layer.4.attention.out_lin.lora.B 0.11443768441677094
distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2513493597507477
distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13147471845149994
distilbert.transformer.layer.4.ffn.lin2.lora.A 0.235763818025589
distilbert.transformer.layer.4.ffn.lin2.lora.B 0.1030246838927269
distilbert.transformer.layer.5.attention.q_lin.lora.A 0.2538462281227112
distilbert.transformer.layer.5.attention.q_lin.lora.B 0.15554296970367432
distilbert.transformer.layer.5.attention.k_lin.lora.A 0.25260573625564575
distilbert.transformer.layer.5.attention.k_lin.lora.B 0.13536706566810608
distilbert.transformer.layer.5.attention.v_lin.lora.A 0.2386242002248764
distilbert.transformer.layer.5.attention.v_lin.lora.B 0.09103643894195557
distilbert.transformer.layer.5.attention.out_lin.lora.A 0.2460632026195526
distilbert.transformer.layer.5.attention.out_lin.lora.B 0.0942901223897934
distilbert.transformer.layer.5.ffn.lin1.lora.A 0.22521410882472992
distilbert.transformer.layer.5.ffn.lin1.lora.B 0.10391643643379211
distilbert.transformer.layer.5.ffn.lin2.lora.A 0.22027425467967987
distilbert.transformer.layer.5.ffn.lin2.lora.B 0.09275849908590317
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 38.1128
distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 22.4404
distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 36.8767
distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 22.2844
distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 36.3068
distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 18.6921
distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 34.4764
distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 18.2422
distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 37.7293
distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 45.6011
distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 67.8830
distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 18.4862
distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 36.3440
distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 19.5374
distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 36.2538
distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 21.8859
distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 32.9675
distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 17.3118
distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 34.3291
distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 16.4920
distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 35.8611
distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 45.0163
distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 73.6737
distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 19.0634
distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 36.9587
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 21.6903
distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 36.3556
distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 21.9942
distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 32.8011
distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 14.5148
distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 34.9976
distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 18.6687
distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 38.4237
distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 45.1276
distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 70.7119
distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 17.3970
distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 35.9975
distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 23.0900
distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 38.8575
distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 23.9777
distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 35.1934
distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 17.0103
distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 35.7466
distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 17.3941
distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 34.6405
distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 40.3837
distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 70.2789
distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 16.5210
distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 35.1940
distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 22.2929
distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 35.7177
distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 21.4201
distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 33.8286
distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 15.7936
distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 35.9960
distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 16.3585
distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 35.2749
distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 37.4493
distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 66.5280
distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 15.0055
distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 35.6735
distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 21.9784
distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 35.6887
distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 19.5168
distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 33.7674
distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 12.6754
distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.6987
distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 13.3650
distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 31.5712
distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 29.7103
distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 61.6965
distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 13.7134
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.scale_out 2.0612637996673584
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9753992557525635
distilbert.transformer.layer.0.attention.k_lin.scale_out 2.041314125061035
distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9644596576690674
distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9422067403793335
distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9674242734909058
distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9067790508270264
distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9497487545013428
distilbert.transformer.layer.1.attention.q_lin.scale_out 2.0170845985412598
distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9822043180465698
distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9795405864715576
distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9653682708740234
distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9267244338989258
distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9673479795455933
distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9100148677825928
distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9623349905014038
distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0313947200775146
distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9576448202133179
distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0266146659851074
distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9557504653930664
distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9229930639266968
distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9482643604278564
distilbert.transformer.layer.2.attention.out_lin.scale_out 1.9133390188217163
distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9484319686889648
distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0091195106506348
distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9489374160766602
distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0385282039642334
distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9662821292877197
distilbert.transformer.layer.3.attention.v_lin.scale_out 1.91229248046875
distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9263949394226074
distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9312078952789307
distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9349567890167236
distilbert.transformer.layer.4.attention.q_lin.scale_out 2.0537590980529785
distilbert.transformer.layer.4.attention.q_lin.scale_in 1.926956057548523
distilbert.transformer.layer.4.attention.k_lin.scale_out 2.062760829925537
distilbert.transformer.layer.4.attention.k_lin.scale_in 1.95039963722229
distilbert.transformer.layer.4.attention.v_lin.scale_out 1.899548888206482
distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9275894165039062
distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9300048351287842
distilbert.transformer.layer.4.attention.out_lin.scale_in 1.9213910102844238
distilbert.transformer.layer.5.attention.q_lin.scale_out 1.9875534772872925
distilbert.transformer.layer.5.attention.q_lin.scale_in 1.948134183883667
distilbert.transformer.layer.5.attention.k_lin.scale_out 2.0664865970611572
distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9336917400360107
distilbert.transformer.layer.5.attention.v_lin.scale_out 1.8987897634506226
distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9277005195617676
distilbert.transformer.layer.5.attention.out_lin.scale_out 1.8940513134002686
distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9552154541015625
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 57.7446
distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.2181
distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.1671
distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8663
distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.2293
distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.9489
distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.2115
distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 54.3861
distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.3383
distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.3214
distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.3866
distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 54.8853
distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.6964
distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 54.7598
distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2538
distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.7067
distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.8258
distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.6692
distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 56.7386
distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 54.5777
distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.6129
distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 54.2336
distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.5210
distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 54.3786
distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.2645
distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4216
distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 57.0892
distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.9664
distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 53.4031
distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.7915
distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.9975
distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 54.0654
distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.5925
distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.8357
distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.7800
distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.3869
distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 53.2735
distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 53.8089
distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.0365
distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 53.7496
distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.7071
distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 54.3873
distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.7937
distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.9497
distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.3837
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.8037
distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 52.9260
distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5427
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.m_out 0.342327356338501
distilbert.transformer.layer.0.attention.q_lin.m_in 0.2673478126525879
distilbert.transformer.layer.0.attention.k_lin.m_out 0.33858394622802734
distilbert.transformer.layer.0.attention.k_lin.m_in 0.25746065378189087
distilbert.transformer.layer.0.attention.v_lin.m_out 0.24844586849212646
distilbert.transformer.layer.0.attention.v_lin.m_in 0.2527827024459839
distilbert.transformer.layer.0.attention.out_lin.m_out 0.21927964687347412
distilbert.transformer.layer.0.attention.out_lin.m_in 0.22732524573802948
distilbert.transformer.layer.1.attention.q_lin.m_out 0.2916175127029419
distilbert.transformer.layer.1.attention.q_lin.m_in 0.24687734246253967
distilbert.transformer.layer.1.attention.k_lin.m_out 0.2892614006996155
distilbert.transformer.layer.1.attention.k_lin.m_in 0.2524157166481018
distilbert.transformer.layer.1.attention.v_lin.m_out 0.21986591815948486
distilbert.transformer.layer.1.attention.v_lin.m_in 0.23661664128303528
distilbert.transformer.layer.1.attention.out_lin.m_out 0.22159534692764282
distilbert.transformer.layer.1.attention.out_lin.m_in 0.23196688294410706
distilbert.transformer.layer.2.attention.q_lin.m_out 0.32500144839286804
distilbert.transformer.layer.2.attention.q_lin.m_in 0.2491009682416916
distilbert.transformer.layer.2.attention.k_lin.m_out 0.31428322196006775
distilbert.transformer.layer.2.attention.k_lin.m_in 0.25061824917793274
distilbert.transformer.layer.2.attention.v_lin.m_out 0.2272852063179016
distilbert.transformer.layer.2.attention.v_lin.m_in 0.22342929244041443
distilbert.transformer.layer.2.attention.out_lin.m_out 0.2447393834590912
distilbert.transformer.layer.2.attention.out_lin.m_in 0.23465190827846527
distilbert.transformer.layer.3.attention.q_lin.m_out 0.30937662720680237
distilbert.transformer.layer.3.attention.q_lin.m_in 0.24855463206768036
distilbert.transformer.layer.3.attention.k_lin.m_out 0.32985740900039673
distilbert.transformer.layer.3.attention.k_lin.m_in 0.2726486921310425
distilbert.transformer.layer.3.attention.v_lin.m_out 0.227847620844841
distilbert.transformer.layer.3.attention.v_lin.m_in 0.23485350608825684
distilbert.transformer.layer.3.attention.out_lin.m_out 0.24030470848083496
distilbert.transformer.layer.3.attention.out_lin.m_in 0.24534587562084198
distilbert.transformer.layer.4.attention.q_lin.m_out 0.3439795970916748
distilbert.transformer.layer.4.attention.q_lin.m_in 0.23196381330490112
distilbert.transformer.layer.4.attention.k_lin.m_out 0.3551006317138672
distilbert.transformer.layer.4.attention.k_lin.m_in 0.23938599228858948
distilbert.transformer.layer.4.attention.v_lin.m_out 0.24515563249588013
distilbert.transformer.layer.4.attention.v_lin.m_in 0.23077671229839325
distilbert.transformer.layer.4.attention.out_lin.m_out 0.24853835999965668
distilbert.transformer.layer.4.attention.out_lin.m_in 0.22509190440177917
distilbert.transformer.layer.5.attention.q_lin.m_out 0.29305499792099
distilbert.transformer.layer.5.attention.q_lin.m_in 0.24855470657348633
distilbert.transformer.layer.5.attention.k_lin.m_out 0.3476855754852295
distilbert.transformer.layer.5.attention.k_lin.m_in 0.23641684651374817
distilbert.transformer.layer.5.attention.v_lin.m_out 0.2526254951953888
distilbert.transformer.layer.5.attention.v_lin.m_in 0.22607746720314026
distilbert.transformer.layer.5.attention.out_lin.m_out 0.2052319496870041
distilbert.transformer.layer.5.attention.out_lin.m_in 0.23453399538993835
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 11.6139
distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 9.6670
distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 11.5400
distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 9.2944
distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 8.8192
distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 9.1880
distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 7.8691
distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 8.5758
distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 10.1820
distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 8.9425
distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 10.2440
distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.1733
distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 7.9383
distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 8.6212
distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.1016
distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 8.6380
distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.2154
distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 8.9758
distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 11.0125
distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 9.1251
distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.3453
distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 8.2017
distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 8.8362
distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 8.7451
distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 10.8372
distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.2071
distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 11.4118
distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 9.7081
distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.3508
distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 8.9906
distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 8.6766
distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.3606
distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.0146
distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 8.5764
distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 12.1504
distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 8.6597
distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.9001
distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 8.9268
distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 9.0088
distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 8.8844
distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 10.5646
distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.1838
distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 11.7903
distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.8605
distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.4302
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 8.7683
distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 7.6707
distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.8617

Set dropout to a more reasonable 0.1

Once A and B are well trained, we can reduce dropout (but avoid dropout = 0.0 at this stage: it makes all LoRA adaptors die out):

  • Not to remove B’s role, but to let A and B synthesise together.

  • This allows the full LoRA (and DDoRA) path to operate unimpeded, leveraging both learned subspaces.

Use different LRs for lora.A and lora.B

In [8]:
def set_all_lora_dropout(model, new_dropout_rate):
    for module in model.modules():
        if isinstance(module, LoRALayer):
            module.dropout.p = new_dropout_rate

def print_dropout_rates(model):
    for name, module in model.named_modules():
        if isinstance(module, LoRALayer):
            print(f"{name}.dropout.p = {module.dropout.p}")

def split_lora_dora_params(model):
    lora_A_params = []
    lora_B_params = []
    m_params = []
    scale_params = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "lora.A" in name:
            lora_A_params.append(param)
        elif "lora.B" in name:
            lora_B_params.append(param)
        elif name.endswith("m_in") or name.endswith("m_out"):
            m_params.append(param)
        elif "scale" in name:
            scale_params.append(param)

    return {
        "lora_A": lora_A_params,
        "lora_B": lora_B_params,
        "m": m_params,
        "scale": scale_params,
    }

def create_custom_optimizer(model, base_lr=1e-4, lr_B_scale=10.0, lr_scale_params=0.2, weight_decay=0.01):
    param_groups = split_lora_dora_params(model)

    optimizer = torch.optim.AdamW([
        {"params": param_groups["lora_A"], "lr": base_lr},
        {"params": param_groups["lora_B"], "lr": base_lr * lr_B_scale},
        {"params": param_groups["m"], "lr": base_lr},
        {"params": param_groups["scale"], "lr": base_lr * lr_scale_params},
    ], weight_decay=weight_decay)

    return optimizer
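
A minimal sketch of how the resulting parameter groups look, assuming the model above and the hyperparameters used in the next cell (base_lr=3e-3, lr_B_scale=0.5, lr_scale_params=0.75):

# Sketch: inspect the per-group learning rates produced by create_custom_optimizer
opt = create_custom_optimizer(model_ddora_all_attn, base_lr=3e-3, lr_B_scale=0.5, lr_scale_params=0.75, weight_decay=1e-5)
for i, group in enumerate(opt.param_groups):
    n_params = sum(p.numel() for p in group["params"])
    print(f"group {i}: lr={group['lr']:.2e}, params={n_params:,}")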
In [9]:
# set dropout to 0.1 to avoid overheating the lora.B channel
# but: avoid dropout = 0.0 when lora.B is already large enough: this leads to LoRA adaptors dying out,
# likely due to unregularised overfitting and gradient collapse on low-magnitude params
set_all_lora_dropout(model_ddora_all_attn, 0.1)
print_dropout_rates(model_ddora_all_attn)

dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 3e-3 ###############



from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"

training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=2, 
    #max_steps=200,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,
    report_to="none",
    log_level="error"
)

trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)

trainer_ddora_all_attn.optimizer = create_custom_optimizer(
    trainer_ddora_all_attn.model,
    base_lr=3e-3, ###########
    lr_B_scale=0.5, #############
    lr_scale_params=0.75, #########
    weight_decay=1e-5,
)


hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)

#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())

#for hook in hooks2:
#    hook.remove()

#for name, grads in gradient_history2.items():
#    print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")

for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}

# Example output
for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")

#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
[1564/1564 2:18:14, Epoch 2/2]
Step Training Loss Validation Loss Accuracy F1
50 0.190300 0.189563 0.926400 0.926298
100 0.154800 0.200476 0.923200 0.923212
150 0.168100 0.203508 0.928000 0.927841
200 0.126000 0.215565 0.924000 0.923856
250 0.144500 0.201292 0.924000 0.923902
300 0.145700 0.195318 0.919200 0.919206
350 0.153700 0.189462 0.925600 0.925583
400 0.116700 0.207675 0.923200 0.923163
450 0.128700 0.215511 0.920000 0.920012
500 0.138200 0.208557 0.923200 0.923163
550 0.142200 0.205139 0.926400 0.926188
600 0.146700 0.206250 0.920800 0.920666
650 0.111200 0.211345 0.924000 0.923930
700 0.109600 0.223709 0.923200 0.922996
750 0.125400 0.221195 0.921600 0.921427
800 0.122200 0.213449 0.920000 0.920012
850 0.093700 0.222432 0.922400 0.922356
900 0.079700 0.247886 0.923200 0.922996
950 0.093800 0.240048 0.920000 0.919873
1000 0.085900 0.235864 0.923200 0.923188
1050 0.096600 0.234580 0.924800 0.924666
1100 0.107200 0.225129 0.923200 0.923136
1150 0.079500 0.227587 0.925600 0.925532
1200 0.097200 0.227462 0.924800 0.924788
1250 0.111300 0.223540 0.924000 0.923957
1300 0.119000 0.221720 0.924000 0.923916
1350 0.132400 0.215398 0.928800 0.928694
1400 0.132600 0.211440 0.924000 0.923944
1450 0.140300 0.210785 0.924800 0.924764
1500 0.149600 0.210460 0.926400 0.926326
1550 0.156600 0.209057 0.924800 0.924751

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 559443 KiB |  12301 MiB | 242076 GiB | 242076 GiB |
|       from large pool | 541440 KiB |  12236 MiB | 241101 GiB | 241101 GiB |
|       from small pool |  18003 KiB |     67 MiB |    974 GiB |    974 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 559443 KiB |  12301 MiB | 242076 GiB | 242076 GiB |
|       from large pool | 541440 KiB |  12236 MiB | 241101 GiB | 241101 GiB |
|       from small pool |  18003 KiB |     67 MiB |    974 GiB |    974 GiB |
|---------------------------------------------------------------------------|
| Requested memory      | 557230 KiB |  12297 MiB | 241659 GiB | 241659 GiB |
|       from large pool | 539228 KiB |  12231 MiB | 240689 GiB | 240689 GiB |
|       from small pool |  18002 KiB |     67 MiB |    970 GiB |    970 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  12554 MiB |  12590 MiB |  24540 MiB |  11986 MiB |
|       from large pool |  12484 MiB |  12520 MiB |  24408 MiB |  11924 MiB |
|       from small pool |     70 MiB |     70 MiB |    132 MiB |     62 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory |  75437 KiB | 146440 KiB |  21097 GiB |  21097 GiB |
|       from large pool |  68864 KiB | 141056 KiB |  20048 GiB |  20048 GiB |
|       from small pool |   6573 KiB |  43066 KiB |   1049 GiB |   1049 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     860    |    1318    |   23222 K  |   23222 K  |
|       from large pool |      80    |     298    |    6133 K  |    6133 K  |
|       from small pool |     780    |    1235    |   17089 K  |   17088 K  |
|---------------------------------------------------------------------------|
| Active allocs         |     860    |    1318    |   23222 K  |   23222 K  |
|       from large pool |      80    |     298    |    6133 K  |    6133 K  |
|       from small pool |     780    |    1235    |   17089 K  |   17088 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     270    |     271    |     515    |     245    |
|       from large pool |     235    |     236    |     449    |     214    |
|       from small pool |      35    |      35    |      66    |      31    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      38    |      90    |   10044 K  |   10044 K  |
|       from large pool |      18    |      24    |     890 K  |     890 K  |
|       from small pool |      20    |      73    |    9153 K  |    9153 K  |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

distilbert.transformer.layer.0.attention.q_lin: |A|=0.2775, |B|=0.1617, |∇A|=2.004e-05, |∇B|=1.97e-05, |LoRA(x)|=2.146e+04, B≠0=12288
distilbert.transformer.layer.0.attention.k_lin: |A|=0.2666, |B|=0.1605, |∇A|=8.605e-06, |∇B|=1.567e-05, |LoRA(x)|=2.343e+04, B≠0=12288
distilbert.transformer.layer.0.attention.v_lin: |A|=0.2632, |B|=0.134, |∇A|=1.352e-05, |∇B|=2.356e-05, |LoRA(x)|=2.373e+04, B≠0=12288
distilbert.transformer.layer.0.attention.out_lin: |A|=0.2496, |B|=0.1311, |∇A|=2.109e-05, |∇B|=5.005e-05, |LoRA(x)|=1.2e+04, B≠0=12288
distilbert.transformer.layer.0.ffn.lin1: |A|=0.2715, |B|=0.1632, |∇A|=3.528e-05, |∇B|=1.96e-05, |LoRA(x)|=5.078e+04, B≠0=49152
distilbert.transformer.layer.0.ffn.lin2: |A|=0.243, |B|=0.1318, |∇A|=1.097e-05, |∇B|=5.569e-05, |LoRA(x)|=1.701e+04, B≠0=12288
distilbert.transformer.layer.1.attention.q_lin: |A|=0.2634, |B|=0.1393, |∇A|=1.438e-05, |∇B|=1.748e-05, |LoRA(x)|=1.95e+04, B≠0=12288
distilbert.transformer.layer.1.attention.k_lin: |A|=0.2621, |B|=0.1571, |∇A|=1.328e-05, |∇B|=2.068e-05, |LoRA(x)|=1.912e+04, B≠0=12288
distilbert.transformer.layer.1.attention.v_lin: |A|=0.2403, |B|=0.1252, |∇A|=1.612e-05, |∇B|=3.199e-05, |LoRA(x)|=1.786e+04, B≠0=12288
distilbert.transformer.layer.1.attention.out_lin: |A|=0.2473, |B|=0.1195, |∇A|=2.096e-05, |∇B|=5.373e-05, |LoRA(x)|=1.431e+04, B≠0=12288
distilbert.transformer.layer.1.ffn.lin1: |A|=0.2604, |B|=0.1617, |∇A|=3.089e-05, |∇B|=2.576e-05, |LoRA(x)|=4.777e+04, B≠0=49152
distilbert.transformer.layer.1.ffn.lin2: |A|=0.264, |B|=0.1358, |∇A|=1.869e-05, |∇B|=7.984e-05, |LoRA(x)|=1.356e+04, B≠0=12288
distilbert.transformer.layer.2.attention.q_lin: |A|=0.2679, |B|=0.1572, |∇A|=1.893e-05, |∇B|=2.563e-05, |LoRA(x)|=2.383e+04, B≠0=12288
distilbert.transformer.layer.2.attention.k_lin: |A|=0.2658, |B|=0.1594, |∇A|=1.879e-05, |∇B|=2.583e-05, |LoRA(x)|=2.269e+04, B≠0=12288
distilbert.transformer.layer.2.attention.v_lin: |A|=0.2404, |B|=0.1073, |∇A|=6.702e-06, |∇B|=2.483e-05, |LoRA(x)|=4.308e+04, B≠0=12288
distilbert.transformer.layer.2.attention.out_lin: |A|=0.2511, |B|=0.1343, |∇A|=3.909e-05, |∇B|=6.579e-05, |LoRA(x)|=1.082e+04, B≠0=12288
distilbert.transformer.layer.2.ffn.lin1: |A|=0.2761, |B|=0.1607, |∇A|=2.807e-05, |∇B|=2.521e-05, |LoRA(x)|=6.752e+04, B≠0=49152
distilbert.transformer.layer.2.ffn.lin2: |A|=0.2535, |B|=0.1248, |∇A|=1.891e-05, |∇B|=7.665e-05, |LoRA(x)|=1.603e+04, B≠0=12288
distilbert.transformer.layer.3.attention.q_lin: |A|=0.2614, |B|=0.1661, |∇A|=1.345e-05, |∇B|=2.416e-05, |LoRA(x)|=2.479e+04, B≠0=12288
distilbert.transformer.layer.3.attention.k_lin: |A|=0.279, |B|=0.1721, |∇A|=2.81e-05, |∇B|=3.036e-05, |LoRA(x)|=2.496e+04, B≠0=12288
distilbert.transformer.layer.3.attention.v_lin: |A|=0.2532, |B|=0.1227, |∇A|=1.519e-05, |∇B|=2.587e-05, |LoRA(x)|=2.351e+04, B≠0=12288
distilbert.transformer.layer.3.attention.out_lin: |A|=0.2567, |B|=0.1261, |∇A|=4.712e-05, |∇B|=3.786e-05, |LoRA(x)|=1.333e+04, B≠0=12288
distilbert.transformer.layer.3.ffn.lin1: |A|=0.2508, |B|=0.1454, |∇A|=2.706e-05, |∇B|=2.267e-05, |LoRA(x)|=4.743e+04, B≠0=49152
distilbert.transformer.layer.3.ffn.lin2: |A|=0.2494, |B|=0.1191, |∇A|=1.207e-05, |∇B|=4.559e-05, |LoRA(x)|=2.457e+04, B≠0=12288
distilbert.transformer.layer.4.attention.q_lin: |A|=0.2541, |B|=0.162, |∇A|=1.095e-05, |∇B|=2.543e-05, |LoRA(x)|=3.06e+04, B≠0=12288
distilbert.transformer.layer.4.attention.k_lin: |A|=0.2586, |B|=0.1548, |∇A|=2.646e-05, |∇B|=2.163e-05, |LoRA(x)|=2.408e+04, B≠0=12288
distilbert.transformer.layer.4.attention.v_lin: |A|=0.242, |B|=0.1145, |∇A|=1.132e-05, |∇B|=1.651e-05, |LoRA(x)|=2.372e+04, B≠0=12288
distilbert.transformer.layer.4.attention.out_lin: |A|=0.2577, |B|=0.1182, |∇A|=3.214e-05, |∇B|=3.197e-05, |LoRA(x)|=1.449e+04, B≠0=12288
distilbert.transformer.layer.4.ffn.lin1: |A|=0.2556, |B|=0.1342, |∇A|=1.524e-05, |∇B|=1.086e-05, |LoRA(x)|=5.044e+04, B≠0=49152
distilbert.transformer.layer.4.ffn.lin2: |A|=0.2387, |B|=0.1068, |∇A|=2.781e-06, |∇B|=1.757e-05, |LoRA(x)|=6.574e+04, B≠0=12288
distilbert.transformer.layer.5.attention.q_lin: |A|=0.2589, |B|=0.1577, |∇A|=1.034e-05, |∇B|=2.488e-05, |LoRA(x)|=3.877e+04, B≠0=12288
distilbert.transformer.layer.5.attention.k_lin: |A|=0.2566, |B|=0.1382, |∇A|=2.165e-05, |∇B|=1.07e-05, |LoRA(x)|=3.278e+04, B≠0=12288
distilbert.transformer.layer.5.attention.v_lin: |A|=0.2421, |B|=0.09396, |∇A|=8.988e-06, |∇B|=9.457e-06, |LoRA(x)|=2.753e+04, B≠0=12288
distilbert.transformer.layer.5.attention.out_lin: |A|=0.25, |B|=0.0986, |∇A|=1.875e-05, |∇B|=1.969e-05, |LoRA(x)|=1.393e+04, B≠0=12288
distilbert.transformer.layer.5.ffn.lin1: |A|=0.2301, |B|=0.107, |∇A|=4.039e-06, |∇B|=3.966e-06, |LoRA(x)|=7.941e+04, B≠0=49152
distilbert.transformer.layer.5.ffn.lin2: |A|=0.2232, |B|=0.09474, |∇A|=2.928e-07, |∇B|=6.926e-06, |LoRA(x)|=1.233e+05, B≠0=12288

Training summary¶

  1. Gradients are not collapsing: ∇A and ∇B both look healthy (all gradient norms fall in the ~1e-5 to 7e-5 range), with no vanishing or explosion.
  2. B norms are consistently smaller than A norms (A in the ~0.22–0.28 range, B in the ~0.1–0.17 range); lr_B_scale = 0.5 kept the B updates more conservative.
  3. |LoRA(x)| is largest in the FFN layers. Layer 5 shows larger activations in the LoRA attention paths, especially in q_lin, and for q_lin its |∇B| = 2.488e-05 still exceeds layer 0's 1.97e-05, so layer 5 is still actively training. That makes the layer-5 freezing in the next section premature (a quick per-layer gradient check is sketched below); treat that run as a demo only, or retrain the model yourself without freezing layer 5.
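As a rough sanity check before deciding which layers to freeze, the monitored gradients can be averaged per transformer layer. The sketch below is illustrative only: it assumes the agg dictionary built from monitor1 in the cell above (a mapping such as agg["B_grad_mean"] from module name to averaged |∇B|), and the 0.5× cut-off is an arbitrary threshold, not a tuned value.

from collections import defaultdict

layer_grads = defaultdict(list)
for name, g in agg["B_grad_mean"].items():
    # module names look like "distilbert.transformer.layer.3.attention.q_lin"
    layer_idx = int(name.split("layer.")[1].split(".")[0])
    layer_grads[layer_idx].append(g)

per_layer = {i: sum(vals) / len(vals) for i, vals in layer_grads.items()}
overall = sum(per_layer.values()) / len(per_layer)
for i, g in sorted(per_layer.items()):
    tag = "freezing candidate" if g < 0.5 * overall else "still training"
    print(f"layer {i}: mean |∇B| = {g:.3e} ({tag})")

Note that averaging over a whole layer can mask an individual module (such as layer 5's q_lin) that is still moving, which is exactly the caveat raised in point 3.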
In [10]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.27467310428619385
distilbert.transformer.layer.0.attention.q_lin.lora.B 0.16087943315505981
distilbert.transformer.layer.0.attention.k_lin.lora.A 0.26493778824806213
distilbert.transformer.layer.0.attention.k_lin.lora.B 0.15994273126125336
distilbert.transformer.layer.0.attention.v_lin.lora.A 0.26134049892425537
distilbert.transformer.layer.0.attention.v_lin.lora.B 0.13279186189174652
distilbert.transformer.layer.0.attention.out_lin.lora.A 0.24835559725761414
distilbert.transformer.layer.0.attention.out_lin.lora.B 0.13021254539489746
distilbert.transformer.layer.0.ffn.lin1.lora.A 0.2695589065551758
distilbert.transformer.layer.0.ffn.lin1.lora.B 0.1625066101551056
distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2408287227153778
distilbert.transformer.layer.0.ffn.lin2.lora.B 0.13101235032081604
distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2614685893058777
distilbert.transformer.layer.1.attention.q_lin.lora.B 0.1385265290737152
distilbert.transformer.layer.1.attention.k_lin.lora.A 0.260200560092926
distilbert.transformer.layer.1.attention.k_lin.lora.B 0.15645989775657654
distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2383284568786621
distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12416227906942368
distilbert.transformer.layer.1.attention.out_lin.lora.A 0.24609380960464478
distilbert.transformer.layer.1.attention.out_lin.lora.B 0.1184653490781784
distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2583957612514496
distilbert.transformer.layer.1.ffn.lin1.lora.B 0.16097836196422577
distilbert.transformer.layer.1.ffn.lin2.lora.A 0.26195210218429565
distilbert.transformer.layer.1.ffn.lin2.lora.B 0.13490238785743713
distilbert.transformer.layer.2.attention.q_lin.lora.A 0.26605361700057983
distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15654103457927704
distilbert.transformer.layer.2.attention.k_lin.lora.A 0.2632990777492523
distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1587207019329071
distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23798131942749023
distilbert.transformer.layer.2.attention.v_lin.lora.B 0.1060485690832138
distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24978193640708923
distilbert.transformer.layer.2.attention.out_lin.lora.B 0.1334354281425476
distilbert.transformer.layer.2.ffn.lin1.lora.A 0.2741438150405884
distilbert.transformer.layer.2.ffn.lin1.lora.B 0.1599275767803192
distilbert.transformer.layer.2.ffn.lin2.lora.A 0.251351535320282
distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12355349957942963
distilbert.transformer.layer.3.attention.q_lin.lora.A 0.25958922505378723
distilbert.transformer.layer.3.attention.q_lin.lora.B 0.16537396609783173
distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2771756947040558
distilbert.transformer.layer.3.attention.k_lin.lora.B 0.17137376964092255
distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2521096467971802
distilbert.transformer.layer.3.attention.v_lin.lora.B 0.12177401036024094
distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2551520764827728
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.1251428872346878
distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24903497099876404
distilbert.transformer.layer.3.ffn.lin1.lora.B 0.14453813433647156
distilbert.transformer.layer.3.ffn.lin2.lora.A 0.24802027642726898
distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11787533760070801
distilbert.transformer.layer.4.attention.q_lin.lora.A 0.25245609879493713
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.16117650270462036
distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2567717432975769
distilbert.transformer.layer.4.attention.k_lin.lora.B 0.15406717360019684
distilbert.transformer.layer.4.attention.v_lin.lora.A 0.24032878875732422
distilbert.transformer.layer.4.attention.v_lin.lora.B 0.11345615983009338
distilbert.transformer.layer.4.attention.out_lin.lora.A 0.2565303146839142
distilbert.transformer.layer.4.attention.out_lin.lora.B 0.11726316064596176
distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2542564272880554
distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13327965140342712
distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2369489073753357
distilbert.transformer.layer.4.ffn.lin2.lora.B 0.10564354062080383
distilbert.transformer.layer.5.attention.q_lin.lora.A 0.25673991441726685
distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1569884866476059
distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2547406554222107
distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1376175880432129
distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24085545539855957
distilbert.transformer.layer.5.attention.v_lin.lora.B 0.09300228208303452
distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24862870573997498
distilbert.transformer.layer.5.attention.out_lin.lora.B 0.09722810983657837
distilbert.transformer.layer.5.ffn.lin1.lora.A 0.22779500484466553
distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1058943122625351
distilbert.transformer.layer.5.ffn.lin2.lora.A 0.22217977046966553
distilbert.transformer.layer.5.ffn.lin2.lora.B 0.09405812621116638
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 38.6747
distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 22.6266
distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 37.3332
distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 22.4854
distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 36.7740
distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 18.9591
distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 35.0878
distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 18.5062
distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 38.2489
distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 45.9634
distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 68.6745
distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 18.7223
distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 36.8761
distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 19.7546
distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 36.6833
distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 22.0810
distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 33.4225
distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 17.5674
distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 34.6941
distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 16.7691
distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 36.3546
distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 45.4047
distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 74.4587
distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 19.2996
distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 37.3979
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 21.8745
distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 36.9386
distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 22.1939
distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 33.2852
distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 14.8303
distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 35.3576
distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 18.9281
distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 38.8654
distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 45.5287
distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 71.2674
distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 17.6604
distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 36.5726
distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 23.2764
distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 39.2914
distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 24.1566
distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 35.6481
distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 17.2679
distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 36.1505
distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 17.6628
distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 35.1100
distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 40.8434
distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 70.7395
distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 16.8413
distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 35.6301
distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 22.4937
distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 36.1941
distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 21.6074
distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.1606
distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 16.1095
distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 36.5053
distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 16.7102
distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 35.7436
distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 37.9658
distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 66.8764
distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 15.3375
distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 36.0503
distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 22.1811
distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 36.0766
distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 19.7476
distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.1035
distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 13.0682
distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 35.0975
distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 13.7873
distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 31.9677
distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 30.2731
distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 62.3751
distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 13.9422
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.scale_out 2.079176187515259
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.976986289024353
distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0598769187927246
distilbert.transformer.layer.0.attention.k_lin.scale_in 1.963384985923767
distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9461816549301147
distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9646905660629272
distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9072974920272827
distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9477274417877197
distilbert.transformer.layer.1.attention.q_lin.scale_out 2.02653169631958
distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9823664426803589
distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9907326698303223
distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9645729064941406
distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9299085140228271
distilbert.transformer.layer.1.attention.v_lin.scale_in 1.964238166809082
distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9065513610839844
distilbert.transformer.layer.1.attention.out_lin.scale_in 1.956520915031433
distilbert.transformer.layer.2.attention.q_lin.scale_out 2.037914752960205
distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9566318988800049
distilbert.transformer.layer.2.attention.k_lin.scale_out 2.038102865219116
distilbert.transformer.layer.2.attention.k_lin.scale_in 1.958008050918579
distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9164390563964844
distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9445397853851318
distilbert.transformer.layer.2.attention.out_lin.scale_out 1.9135068655014038
distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9422650337219238
distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0179243087768555
distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9496374130249023
distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0496463775634766
distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9652724266052246
distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9071924686431885
distilbert.transformer.layer.3.attention.v_lin.scale_in 1.922563910484314
distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9339063167572021
distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9289063215255737
distilbert.transformer.layer.4.attention.q_lin.scale_out 2.0540103912353516
distilbert.transformer.layer.4.attention.q_lin.scale_in 1.921581506729126
distilbert.transformer.layer.4.attention.k_lin.scale_out 2.065849781036377
distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9476207494735718
distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8906574249267578
distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9181361198425293
distilbert.transformer.layer.4.attention.out_lin.scale_out 1.92246413230896
distilbert.transformer.layer.4.attention.out_lin.scale_in 1.9140986204147339
distilbert.transformer.layer.5.attention.q_lin.scale_out 1.989418387413025
distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9430391788482666
distilbert.transformer.layer.5.attention.k_lin.scale_out 2.071225643157959
distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9252580404281616
distilbert.transformer.layer.5.attention.v_lin.scale_out 1.8846511840820312
distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9164375066757202
distilbert.transformer.layer.5.attention.out_lin.scale_out 1.880750060081482
distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9473273754119873
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.2769
distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.2697
distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.7125
distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8566
distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.3819
distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.8937
distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.2645
distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 54.3593
distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.6280
distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.3418
distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.7236
distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 54.8762
distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.8090
distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 54.6914
distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.1904
distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.5742
distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 57.0324
distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.6576
distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.0930
distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 54.6503
distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.4667
distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 54.1525
distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.5581
distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 54.2392
distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.5301
distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4601
distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 57.4297
distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.9576
distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 53.2929
distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.7103
distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 54.1205
distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.9308
distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.6259
distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7231
distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.8851
distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.3364
distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 53.0858
distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 53.5845
distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 53.8456
distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 53.5895
distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.7921
distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 54.2675
distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.9446
distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.7487
distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.0761
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.5358
distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 52.6063
distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.3667
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.m_out 0.3690018653869629
distilbert.transformer.layer.0.attention.q_lin.m_in 0.27579522132873535
distilbert.transformer.layer.0.attention.k_lin.m_out 0.3665243089199066
distilbert.transformer.layer.0.attention.k_lin.m_in 0.2646521329879761
distilbert.transformer.layer.0.attention.v_lin.m_out 0.26364845037460327
distilbert.transformer.layer.0.attention.v_lin.m_in 0.25755858421325684
distilbert.transformer.layer.0.attention.out_lin.m_out 0.23036295175552368
distilbert.transformer.layer.0.attention.out_lin.m_in 0.2361956685781479
distilbert.transformer.layer.1.attention.q_lin.m_out 0.30955392122268677
distilbert.transformer.layer.1.attention.q_lin.m_in 0.2551559805870056
distilbert.transformer.layer.1.attention.k_lin.m_out 0.3091847002506256
distilbert.transformer.layer.1.attention.k_lin.m_in 0.2579241394996643
distilbert.transformer.layer.1.attention.v_lin.m_out 0.23248517513275146
distilbert.transformer.layer.1.attention.v_lin.m_in 0.24111609160900116
distilbert.transformer.layer.1.attention.out_lin.m_out 0.22805188596248627
distilbert.transformer.layer.1.attention.out_lin.m_in 0.23579005897045135
distilbert.transformer.layer.2.attention.q_lin.m_out 0.33838027715682983
distilbert.transformer.layer.2.attention.q_lin.m_in 0.25477975606918335
distilbert.transformer.layer.2.attention.k_lin.m_out 0.3349052369594574
distilbert.transformer.layer.2.attention.k_lin.m_in 0.26004013419151306
distilbert.transformer.layer.2.attention.v_lin.m_out 0.23051267862319946
distilbert.transformer.layer.2.attention.v_lin.m_in 0.22866296768188477
distilbert.transformer.layer.2.attention.out_lin.m_out 0.25453248620033264
distilbert.transformer.layer.2.attention.out_lin.m_in 0.2382659763097763
distilbert.transformer.layer.3.attention.q_lin.m_out 0.32573193311691284
distilbert.transformer.layer.3.attention.q_lin.m_in 0.2568552494049072
distilbert.transformer.layer.3.attention.k_lin.m_out 0.3485237956047058
distilbert.transformer.layer.3.attention.k_lin.m_in 0.27685362100601196
distilbert.transformer.layer.3.attention.v_lin.m_out 0.23205050826072693
distilbert.transformer.layer.3.attention.v_lin.m_in 0.23978719115257263
distilbert.transformer.layer.3.attention.out_lin.m_out 0.25406739115715027
distilbert.transformer.layer.3.attention.out_lin.m_in 0.2500074505805969
distilbert.transformer.layer.4.attention.q_lin.m_out 0.3506551682949066
distilbert.transformer.layer.4.attention.q_lin.m_in 0.23591524362564087
distilbert.transformer.layer.4.attention.k_lin.m_out 0.3639422357082367
distilbert.transformer.layer.4.attention.k_lin.m_in 0.2439534068107605
distilbert.transformer.layer.4.attention.v_lin.m_out 0.24837157130241394
distilbert.transformer.layer.4.attention.v_lin.m_in 0.2317391335964203
distilbert.transformer.layer.4.attention.out_lin.m_out 0.2500099837779999
distilbert.transformer.layer.4.attention.out_lin.m_in 0.22958096861839294
distilbert.transformer.layer.5.attention.q_lin.m_out 0.3036739230155945
distilbert.transformer.layer.5.attention.q_lin.m_in 0.251531720161438
distilbert.transformer.layer.5.attention.k_lin.m_out 0.3580213785171509
distilbert.transformer.layer.5.attention.k_lin.m_in 0.23563992977142334
distilbert.transformer.layer.5.attention.v_lin.m_out 0.25412318110466003
distilbert.transformer.layer.5.attention.v_lin.m_in 0.2263183444738388
distilbert.transformer.layer.5.attention.out_lin.m_out 0.20213589072227478
distilbert.transformer.layer.5.attention.out_lin.m_in 0.23632609844207764
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 12.4339
distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 9.8294
distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 12.3191
distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 9.4829
distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 9.2940
distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 9.3087
distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 8.2266
distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 8.8123
distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 10.6891
distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 9.1400
distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 10.8123
distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.3261
distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 8.2857
distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 8.7601
distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.2666
distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 8.7566
distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.6090
distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.1298
distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 11.6100
distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 9.3270
distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.4656
distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 8.3839
distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.1117
distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 8.8582
distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.2639
distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.3725
distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 11.9785
distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 9.8492
distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.4670
distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.1364
distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.1048
distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.4892
distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.2100
distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 8.7015
distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 12.3922
distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 8.8146
distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.9844
distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 8.9990
distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 8.9444
distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.0421
distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 10.8569
distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.2505
distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.0845
distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.9086
distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.4463
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 8.8336
distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 7.7024
distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.9938

Freeze FFN layers, freeze attention in the last layer¶

Adapt LRs¶
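The continued run below lowers the learning rates: TrainingArguments gets learning_rate = 1e-4 (down from 3e-3), and the custom optimizer is rebuilt with base_lr = 1e-3 while lr_B_scale and lr_scale_params are reset to 1.0 (previously 0.5 and 0.75), so the remaining attention adaptors all train at the same, lower rate.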

In [11]:
def freeze_ddora_layer(model, layer_idx):
    target_prefix = f"distilbert.transformer.layer.{layer_idx}."
    for name, param in model.named_parameters():
        if name.startswith(target_prefix):
            param.requires_grad = False
                
# Freeze all parameters (base weights and LoRA/DDoRA additions) in both FFN sublayers of every transformer block
for i in range(6): 
    lin1 = model_ddora_all_attn.distilbert.transformer.layer[i].ffn.lin1
    lin2 = model_ddora_all_attn.distilbert.transformer.layer[i].ffn.lin2
    
    for param in lin1.parameters():
        param.requires_grad = False
    for param in lin2.parameters():
        param.requires_grad = False

# Also freeze the entire last transformer layer (attention included); see the caveat in the training summary above
freeze_ddora_layer(model_ddora_all_attn, layer_idx=5)
for name, param in model_ddora_all_attn.named_parameters():
    if param.requires_grad:
        print(name)

print_dropout_rates(model_ddora_all_attn)

# Set LoRA dropout to 0.1 to keep the lora.B channel from overheating.
# But avoid dropout = 0.0 once lora.B is already large: that leads to the LoRA adaptors dying out
# (likely due to unregularised overfitting and gradient collapse on low-magnitude parameters).
set_all_lora_dropout(model_ddora_all_attn, 0.1)
print_dropout_rates(model_ddora_all_attn)

dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor = 2.0
batch_size = 32
learning_rate = 1e-4 ###############



from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"

training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=2, 
    #max_steps=200,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,
    report_to="none",
    log_level="error"
)

trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)

trainer_ddora_all_attn.optimizer = create_custom_optimizer(
    trainer_ddora_all_attn.model,
    base_lr=1e-3, ###########
    lr_B_scale=1.0, #############
    lr_scale_params=1.0, #########
    weight_decay=1e-5,
)


hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)

# Train!
trainer_ddora_all_attn.train()
print(torch.cuda.memory_summary())

#for hook in hooks2:
#    hook.remove()

#for name, grads in gradient_history2.items():
#    print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")

for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}

for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'].get(name, 0.0):.4g}, |∇B|={agg['B_grad_mean'].get(name, 0.0):.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'].get(name, 0.0):.4g}, B≠0={agg['B_nonzero_count'].get(name, 0):.0f}")

#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
distilbert.transformer.layer.0.attention.q_lin.m_out
distilbert.transformer.layer.0.attention.q_lin.m_in
distilbert.transformer.layer.0.attention.q_lin.scale_out
distilbert.transformer.layer.0.attention.q_lin.scale_in
distilbert.transformer.layer.0.attention.q_lin.lora.A
distilbert.transformer.layer.0.attention.q_lin.lora.B
distilbert.transformer.layer.0.attention.k_lin.m_out
distilbert.transformer.layer.0.attention.k_lin.m_in
distilbert.transformer.layer.0.attention.k_lin.scale_out
distilbert.transformer.layer.0.attention.k_lin.scale_in
distilbert.transformer.layer.0.attention.k_lin.lora.A
distilbert.transformer.layer.0.attention.k_lin.lora.B
distilbert.transformer.layer.0.attention.v_lin.m_out
distilbert.transformer.layer.0.attention.v_lin.m_in
distilbert.transformer.layer.0.attention.v_lin.scale_out
distilbert.transformer.layer.0.attention.v_lin.scale_in
distilbert.transformer.layer.0.attention.v_lin.lora.A
distilbert.transformer.layer.0.attention.v_lin.lora.B
distilbert.transformer.layer.0.attention.out_lin.m_out
distilbert.transformer.layer.0.attention.out_lin.m_in
distilbert.transformer.layer.0.attention.out_lin.scale_out
distilbert.transformer.layer.0.attention.out_lin.scale_in
distilbert.transformer.layer.0.attention.out_lin.lora.A
distilbert.transformer.layer.0.attention.out_lin.lora.B
distilbert.transformer.layer.1.attention.q_lin.m_out
distilbert.transformer.layer.1.attention.q_lin.m_in
distilbert.transformer.layer.1.attention.q_lin.scale_out
distilbert.transformer.layer.1.attention.q_lin.scale_in
distilbert.transformer.layer.1.attention.q_lin.lora.A
distilbert.transformer.layer.1.attention.q_lin.lora.B
distilbert.transformer.layer.1.attention.k_lin.m_out
distilbert.transformer.layer.1.attention.k_lin.m_in
distilbert.transformer.layer.1.attention.k_lin.scale_out
distilbert.transformer.layer.1.attention.k_lin.scale_in
distilbert.transformer.layer.1.attention.k_lin.lora.A
distilbert.transformer.layer.1.attention.k_lin.lora.B
distilbert.transformer.layer.1.attention.v_lin.m_out
distilbert.transformer.layer.1.attention.v_lin.m_in
distilbert.transformer.layer.1.attention.v_lin.scale_out
distilbert.transformer.layer.1.attention.v_lin.scale_in
distilbert.transformer.layer.1.attention.v_lin.lora.A
distilbert.transformer.layer.1.attention.v_lin.lora.B
distilbert.transformer.layer.1.attention.out_lin.m_out
distilbert.transformer.layer.1.attention.out_lin.m_in
distilbert.transformer.layer.1.attention.out_lin.scale_out
distilbert.transformer.layer.1.attention.out_lin.scale_in
distilbert.transformer.layer.1.attention.out_lin.lora.A
distilbert.transformer.layer.1.attention.out_lin.lora.B
distilbert.transformer.layer.2.attention.q_lin.m_out
distilbert.transformer.layer.2.attention.q_lin.m_in
distilbert.transformer.layer.2.attention.q_lin.scale_out
distilbert.transformer.layer.2.attention.q_lin.scale_in
distilbert.transformer.layer.2.attention.q_lin.lora.A
distilbert.transformer.layer.2.attention.q_lin.lora.B
distilbert.transformer.layer.2.attention.k_lin.m_out
distilbert.transformer.layer.2.attention.k_lin.m_in
distilbert.transformer.layer.2.attention.k_lin.scale_out
distilbert.transformer.layer.2.attention.k_lin.scale_in
distilbert.transformer.layer.2.attention.k_lin.lora.A
distilbert.transformer.layer.2.attention.k_lin.lora.B
distilbert.transformer.layer.2.attention.v_lin.m_out
distilbert.transformer.layer.2.attention.v_lin.m_in
distilbert.transformer.layer.2.attention.v_lin.scale_out
distilbert.transformer.layer.2.attention.v_lin.scale_in
distilbert.transformer.layer.2.attention.v_lin.lora.A
distilbert.transformer.layer.2.attention.v_lin.lora.B
distilbert.transformer.layer.2.attention.out_lin.m_out
distilbert.transformer.layer.2.attention.out_lin.m_in
distilbert.transformer.layer.2.attention.out_lin.scale_out
distilbert.transformer.layer.2.attention.out_lin.scale_in
distilbert.transformer.layer.2.attention.out_lin.lora.A
distilbert.transformer.layer.2.attention.out_lin.lora.B
distilbert.transformer.layer.3.attention.q_lin.m_out
distilbert.transformer.layer.3.attention.q_lin.m_in
distilbert.transformer.layer.3.attention.q_lin.scale_out
distilbert.transformer.layer.3.attention.q_lin.scale_in
distilbert.transformer.layer.3.attention.q_lin.lora.A
distilbert.transformer.layer.3.attention.q_lin.lora.B
distilbert.transformer.layer.3.attention.k_lin.m_out
distilbert.transformer.layer.3.attention.k_lin.m_in
distilbert.transformer.layer.3.attention.k_lin.scale_out
distilbert.transformer.layer.3.attention.k_lin.scale_in
distilbert.transformer.layer.3.attention.k_lin.lora.A
distilbert.transformer.layer.3.attention.k_lin.lora.B
distilbert.transformer.layer.3.attention.v_lin.m_out
distilbert.transformer.layer.3.attention.v_lin.m_in
distilbert.transformer.layer.3.attention.v_lin.scale_out
distilbert.transformer.layer.3.attention.v_lin.scale_in
distilbert.transformer.layer.3.attention.v_lin.lora.A
distilbert.transformer.layer.3.attention.v_lin.lora.B
distilbert.transformer.layer.3.attention.out_lin.m_out
distilbert.transformer.layer.3.attention.out_lin.m_in
distilbert.transformer.layer.3.attention.out_lin.scale_out
distilbert.transformer.layer.3.attention.out_lin.scale_in
distilbert.transformer.layer.3.attention.out_lin.lora.A
distilbert.transformer.layer.3.attention.out_lin.lora.B
distilbert.transformer.layer.4.attention.q_lin.m_out
distilbert.transformer.layer.4.attention.q_lin.m_in
distilbert.transformer.layer.4.attention.q_lin.scale_out
distilbert.transformer.layer.4.attention.q_lin.scale_in
distilbert.transformer.layer.4.attention.q_lin.lora.A
distilbert.transformer.layer.4.attention.q_lin.lora.B
distilbert.transformer.layer.4.attention.k_lin.m_out
distilbert.transformer.layer.4.attention.k_lin.m_in
distilbert.transformer.layer.4.attention.k_lin.scale_out
distilbert.transformer.layer.4.attention.k_lin.scale_in
distilbert.transformer.layer.4.attention.k_lin.lora.A
distilbert.transformer.layer.4.attention.k_lin.lora.B
distilbert.transformer.layer.4.attention.v_lin.m_out
distilbert.transformer.layer.4.attention.v_lin.m_in
distilbert.transformer.layer.4.attention.v_lin.scale_out
distilbert.transformer.layer.4.attention.v_lin.scale_in
distilbert.transformer.layer.4.attention.v_lin.lora.A
distilbert.transformer.layer.4.attention.v_lin.lora.B
distilbert.transformer.layer.4.attention.out_lin.m_out
distilbert.transformer.layer.4.attention.out_lin.m_in
distilbert.transformer.layer.4.attention.out_lin.scale_out
distilbert.transformer.layer.4.attention.out_lin.scale_in
distilbert.transformer.layer.4.attention.out_lin.lora.A
distilbert.transformer.layer.4.attention.out_lin.lora.B
pre_classifier.weight
pre_classifier.bias
classifier.weight
classifier.bias
distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
[1564/1564 1:24:45, Epoch 2/2]
Step Training Loss Validation Loss Accuracy F1
50 0.129500 0.192208 0.928000 0.928000
100 0.102500 0.195713 0.926400 0.926365
150 0.098800 0.205262 0.921600 0.921644
200 0.070700 0.209579 0.924000 0.923957
250 0.086800 0.218905 0.918400 0.918477
300 0.088500 0.215237 0.926400 0.926352
350 0.098500 0.214950 0.924000 0.924028
400 0.107200 0.215835 0.920800 0.920840
450 0.129700 0.212644 0.924000 0.924006
500 0.139400 0.205913 0.921600 0.921575
550 0.139600 0.202001 0.927200 0.927092
600 0.147400 0.199926 0.924800 0.924800
650 0.110000 0.202905 0.925600 0.925459
700 0.107900 0.204202 0.924000 0.923970
750 0.123700 0.205488 0.927200 0.927133
800 0.123000 0.207789 0.920000 0.920024
850 0.094700 0.208785 0.921600 0.921612
900 0.090300 0.210170 0.926400 0.926326
950 0.100800 0.212387 0.924800 0.924724
1000 0.088100 0.213624 0.926400 0.926365
1050 0.101400 0.213821 0.926400 0.926365
1100 0.109700 0.213743 0.925600 0.925545
1150 0.083400 0.214692 0.926400 0.926326
1200 0.105700 0.214382 0.925600 0.925545
1250 0.114500 0.213811 0.924800 0.924738
1300 0.119800 0.213729 0.926400 0.926326
1350 0.142000 0.212341 0.926400 0.926326
1400 0.137100 0.211481 0.926400 0.926326
1450 0.149900 0.210727 0.927200 0.927133
1500 0.162000 0.210390 0.927200 0.927133
1550 0.163200 0.210198 0.927200 0.927133

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 552099 KiB |  12301 MiB | 354221 GiB | 354221 GiB |
|       from large pool | 541440 KiB |  12236 MiB | 352777 GiB | 352776 GiB |
|       from small pool |  10659 KiB |     67 MiB |   1444 GiB |   1444 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 552099 KiB |  12301 MiB | 354221 GiB | 354221 GiB |
|       from large pool | 541440 KiB |  12236 MiB | 352777 GiB | 352776 GiB |
|       from small pool |  10659 KiB |     67 MiB |   1444 GiB |   1444 GiB |
|---------------------------------------------------------------------------|
| Requested memory      | 549886 KiB |  12297 MiB | 353679 GiB | 353679 GiB |
|       from large pool | 539228 KiB |  12231 MiB | 352242 GiB | 352241 GiB |
|       from small pool |  10658 KiB |     67 MiB |   1437 GiB |   1437 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   7954 MiB |  12590 MiB |  31890 MiB |  23936 MiB |
|       from large pool |   7904 MiB |  12520 MiB |  31716 MiB |  23812 MiB |
|       from small pool |     50 MiB |     70 MiB |    174 MiB |    124 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory |  76637 KiB | 319790 KiB |  65944 GiB |  65944 GiB |
|       from large pool |  68864 KiB | 314752 KiB |  64384 GiB |  64383 GiB |
|       from small pool |   7773 KiB |  43066 KiB |   1560 GiB |   1560 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     668    |    1318    |   33579 K  |   33579 K  |
|       from large pool |      80    |     298    |    9048 K  |    9048 K  |
|       from small pool |     588    |    1235    |   24531 K  |   24530 K  |
|---------------------------------------------------------------------------|
| Active allocs         |     668    |    1318    |   33579 K  |   33579 K  |
|       from large pool |      80    |     298    |    9048 K  |    9048 K  |
|       from small pool |     588    |    1235    |   24531 K  |   24530 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     154    |     271    |     643    |     489    |
|       from large pool |     129    |     236    |     556    |     427    |
|       from small pool |      25    |      35    |      87    |      62    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      33    |      90    |   14904 K  |   14904 K  |
|       from large pool |      18    |      24    |    1709 K  |    1709 K  |
|       from small pool |      15    |      73    |   13194 K  |   13194 K  |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

distilbert.transformer.layer.0.attention.q_lin: |A|=0.2769, |B|=0.1618, |∇A|=2.095e-05, |∇B|=1.97e-05, |LoRA(x)|=2.076e+04, B≠0=12288
distilbert.transformer.layer.0.attention.k_lin: |A|=0.2662, |B|=0.1606, |∇A|=8.706e-06, |∇B|=1.546e-05, |LoRA(x)|=2.29e+04, B≠0=12288
distilbert.transformer.layer.0.attention.v_lin: |A|=0.2628, |B|=0.1347, |∇A|=1.436e-05, |∇B|=2.381e-05, |LoRA(x)|=2.267e+04, B≠0=12288
distilbert.transformer.layer.0.attention.out_lin: |A|=0.2499, |B|=0.1315, |∇A|=2.324e-05, |∇B|=5.055e-05, |LoRA(x)|=1.135e+04, B≠0=12288
distilbert.transformer.layer.0.ffn.lin1: |A|=0.2696, |B|=0.1625, |∇A|=0, |∇B|=0, |LoRA(x)|=5.201e+04, B≠0=49152
distilbert.transformer.layer.0.ffn.lin2: |A|=0.2408, |B|=0.131, |∇A|=0, |∇B|=0, |LoRA(x)|=1.624e+04, B≠0=12288
distilbert.transformer.layer.1.attention.q_lin: |A|=0.2634, |B|=0.1395, |∇A|=1.536e-05, |∇B|=1.773e-05, |LoRA(x)|=1.814e+04, B≠0=12288
distilbert.transformer.layer.1.attention.k_lin: |A|=0.2617, |B|=0.1574, |∇A|=1.364e-05, |∇B|=2.045e-05, |LoRA(x)|=1.879e+04, B≠0=12288
distilbert.transformer.layer.1.attention.v_lin: |A|=0.24, |B|=0.1254, |∇A|=1.798e-05, |∇B|=3.421e-05, |LoRA(x)|=1.623e+04, B≠0=12288
distilbert.transformer.layer.1.attention.out_lin: |A|=0.2471, |B|=0.1199, |∇A|=2.339e-05, |∇B|=5.574e-05, |LoRA(x)|=1.364e+04, B≠0=12288
distilbert.transformer.layer.1.ffn.lin1: |A|=0.2584, |B|=0.161, |∇A|=0, |∇B|=0, |LoRA(x)|=5.053e+04, B≠0=49152
distilbert.transformer.layer.1.ffn.lin2: |A|=0.262, |B|=0.1349, |∇A|=0, |∇B|=0, |LoRA(x)|=1.41e+04, B≠0=12288
distilbert.transformer.layer.2.attention.q_lin: |A|=0.2674, |B|=0.1574, |∇A|=1.91e-05, |∇B|=2.529e-05, |LoRA(x)|=2.3e+04, B≠0=12288
distilbert.transformer.layer.2.attention.k_lin: |A|=0.2654, |B|=0.1596, |∇A|=1.934e-05, |∇B|=2.576e-05, |LoRA(x)|=2.211e+04, B≠0=12288
distilbert.transformer.layer.2.attention.v_lin: |A|=0.239, |B|=0.1072, |∇A|=8.179e-06, |∇B|=2.949e-05, |LoRA(x)|=3.823e+04, B≠0=12288
distilbert.transformer.layer.2.attention.out_lin: |A|=0.2511, |B|=0.1348, |∇A|=4.259e-05, |∇B|=6.823e-05, |LoRA(x)|=1.03e+04, B≠0=12288
distilbert.transformer.layer.2.ffn.lin1: |A|=0.2741, |B|=0.1599, |∇A|=0, |∇B|=0, |LoRA(x)|=7.083e+04, B≠0=49152
distilbert.transformer.layer.2.ffn.lin2: |A|=0.2514, |B|=0.1236, |∇A|=0, |∇B|=0, |LoRA(x)|=1.92e+04, B≠0=12288
distilbert.transformer.layer.3.attention.q_lin: |A|=0.2613, |B|=0.1663, |∇A|=1.421e-05, |∇B|=2.463e-05, |LoRA(x)|=2.365e+04, B≠0=12288
distilbert.transformer.layer.3.attention.k_lin: |A|=0.2784, |B|=0.1723, |∇A|=2.856e-05, |∇B|=2.999e-05, |LoRA(x)|=2.423e+04, B≠0=12288
distilbert.transformer.layer.3.attention.v_lin: |A|=0.2536, |B|=0.1229, |∇A|=1.677e-05, |∇B|=2.682e-05, |LoRA(x)|=2.236e+04, B≠0=12288
distilbert.transformer.layer.3.attention.out_lin: |A|=0.2565, |B|=0.1265, |∇A|=5.192e-05, |∇B|=4.08e-05, |LoRA(x)|=1.278e+04, B≠0=12288
distilbert.transformer.layer.3.ffn.lin1: |A|=0.249, |B|=0.1445, |∇A|=0, |∇B|=0, |LoRA(x)|=4.909e+04, B≠0=49152
distilbert.transformer.layer.3.ffn.lin2: |A|=0.248, |B|=0.1179, |∇A|=0, |∇B|=0, |LoRA(x)|=2.538e+04, B≠0=12288
distilbert.transformer.layer.4.attention.q_lin: |A|=0.254, |B|=0.1624, |∇A|=1.096e-05, |∇B|=2.417e-05, |LoRA(x)|=3.029e+04, B≠0=12288
distilbert.transformer.layer.4.attention.k_lin: |A|=0.2586, |B|=0.155, |∇A|=2.714e-05, |∇B|=2.183e-05, |LoRA(x)|=2.354e+04, B≠0=12288
distilbert.transformer.layer.4.attention.v_lin: |A|=0.2423, |B|=0.1148, |∇A|=1.434e-05, |∇B|=1.919e-05, |LoRA(x)|=2.227e+04, B≠0=12288
distilbert.transformer.layer.4.attention.out_lin: |A|=0.2577, |B|=0.1189, |∇A|=3.38e-05, |∇B|=3.305e-05, |LoRA(x)|=1.403e+04, B≠0=12288
distilbert.transformer.layer.4.ffn.lin1: |A|=0.2543, |B|=0.1333, |∇A|=0, |∇B|=0, |LoRA(x)|=5.194e+04, B≠0=49151
distilbert.transformer.layer.4.ffn.lin2: |A|=0.2369, |B|=0.1056, |∇A|=0, |∇B|=0, |LoRA(x)|=7.262e+04, B≠0=12288
distilbert.transformer.layer.5.attention.q_lin: |A|=0.2567, |B|=0.157, |∇A|=0, |∇B|=0, |LoRA(x)|=4.137e+04, B≠0=12288
distilbert.transformer.layer.5.attention.k_lin: |A|=0.2547, |B|=0.1376, |∇A|=0, |∇B|=0, |LoRA(x)|=3.322e+04, B≠0=12288
distilbert.transformer.layer.5.attention.v_lin: |A|=0.2409, |B|=0.093, |∇A|=0, |∇B|=0, |LoRA(x)|=2.855e+04, B≠0=12288
distilbert.transformer.layer.5.attention.out_lin: |A|=0.2486, |B|=0.09723, |∇A|=0, |∇B|=0, |LoRA(x)|=1.485e+04, B≠0=12288
distilbert.transformer.layer.5.ffn.lin1: |A|=0.2278, |B|=0.1059, |∇A|=0, |∇B|=0, |LoRA(x)|=1.007e+05, B≠0=49152
distilbert.transformer.layer.5.ffn.lin2: |A|=0.2222, |B|=0.09406, |∇A|=0, |∇B|=0, |LoRA(x)|=1.188e+05, B≠0=12288
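
The FutureWarning near the top of this output comes from passing `evaluation_strategy` to `TrainingArguments`; recent transformers releases rename the argument to `eval_strategy`. A minimal sketch of the rename follows; the other values shown are illustrative guesses inferred from the run above (eval every 50 steps, 2 epochs), not the notebook's actual configuration.

# Hedged sketch: use `eval_strategy` instead of the deprecated `evaluation_strategy`.
# All other arguments here are placeholders, not the notebook's actual settings.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="ddora_imdb_output",   # hypothetical output directory
    eval_strategy="steps",            # was: evaluation_strategy="steps"
    eval_steps=50,
    num_train_epochs=2,
)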

Training summary

  1. No obvious overfitting or catastrophic drift: the validation loss is slightly noisy but essentially flat, and validation accuracy/F1 hold steady in the 92.0–92.7% range from step ~300 onward, finishing around 92.7%.
  2. |LoRA(x)| magnitudes are healthy, ranging from roughly 1e4 to 1e5 across modules. |∇A| and |∇B| are non-zero for the attention projections in layers 0–4, while the FFN adapters and all of layer 5 report zero gradients in this dump (see the sketch below for how such diagnostics can be collected).
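
The per-module diagnostics printed above (|A|, |B|, |∇A|, |∇B|, |LoRA(x)|, B≠0 counts) come from custom logging that is not shown in this cell. Below is a minimal sketch of how such a dump could be produced, assuming each adapted linear layer exposes a `lora` submodule holding `A` and `B` parameters, as the parameter names suggest; `lora_diagnostics` and `sample_batch` are illustrative names, not the notebook's actual helpers.

import torch

def lora_diagnostics(model, sample_batch=None):
    # Collect per-module statistics similar to the diagnostic dump above.
    stats = {}

    # Optional: capture |LoRA(x)| with forward hooks on each `lora` submodule
    # (assumes the adapted layers expose a submodule literally named `lora`).
    handles = []
    if sample_batch is not None:
        def make_hook(key):
            def hook(module, inputs, output):
                stats.setdefault(key, {})["|LoRA(x)|"] = output.norm().item()
            return hook
        for name, module in model.named_modules():
            if name.endswith(".lora"):
                handles.append(module.register_forward_hook(make_hook(name[:-len(".lora")])))
        with torch.no_grad():
            model(**sample_batch)      # one forward pass to trigger the hooks
        for h in handles:
            h.remove()

    # Weight and gradient magnitudes read straight from the named parameters;
    # gradients are only populated if a backward pass has just been run.
    for name, p in model.named_parameters():
        if ".lora.A" in name or ".lora.B" in name:
            key, mat = name.rsplit(".lora.", 1)
            d = stats.setdefault(key, {})
            d[f"|{mat}|"] = p.abs().mean().item()
            d[f"|grad {mat}|"] = p.grad.abs().mean().item() if p.grad is not None else 0.0
            d[f"{mat} nonzero"] = int((p != 0).sum())
    return stats

# Example usage (names are placeholders):
# for module_name, d in lora_diagnostics(model_ddora_all_attn).items():
#     print(module_name, d)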
In [12]:
# Inspect the trained adapter parameters: LoRA matrices (lora.A / lora.B),
# the scaling vectors (scale_in / scale_out), and the magnitude vectors
# (m_in / m_out). For each group, report the mean absolute value and the norm.
for pattern in ("lora", "lin.scale", "lin.m"):
    print('Parameter Statistics: mean.abs()')
    for name, param in model_ddora_all_attn.named_parameters():
        if pattern in name:
            print(name, param.abs().mean().item())
    print('Parameter Statistics: param.norm()')
    for name, param in model_ddora_all_attn.named_parameters():
        if pattern in name:
            print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.2749776840209961
distilbert.transformer.layer.0.attention.q_lin.lora.B 0.16095973551273346
distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2651287913322449
distilbert.transformer.layer.0.attention.k_lin.lora.B 0.1600244641304016
distilbert.transformer.layer.0.attention.v_lin.lora.A 0.2614787518978119
distilbert.transformer.layer.0.attention.v_lin.lora.B 0.1331043690443039
distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2485843151807785
distilbert.transformer.layer.0.attention.out_lin.lora.B 0.13032867014408112
distilbert.transformer.layer.0.ffn.lin1.lora.A 0.2695589065551758
distilbert.transformer.layer.0.ffn.lin1.lora.B 0.1625066101551056
distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2408287227153778
distilbert.transformer.layer.0.ffn.lin2.lora.B 0.13101235032081604
distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2617260217666626
distilbert.transformer.layer.1.attention.q_lin.lora.B 0.1386014223098755
distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2603480815887451
distilbert.transformer.layer.1.attention.k_lin.lora.B 0.15657925605773926
distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2385316789150238
distilbert.transformer.layer.1.attention.v_lin.lora.B 0.12427060306072235
distilbert.transformer.layer.1.attention.out_lin.lora.A 0.24628107249736786
distilbert.transformer.layer.1.attention.out_lin.lora.B 0.11874490976333618
distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2583957612514496
distilbert.transformer.layer.1.ffn.lin1.lora.B 0.16097836196422577
distilbert.transformer.layer.1.ffn.lin2.lora.A 0.26195210218429565
distilbert.transformer.layer.1.ffn.lin2.lora.B 0.13490238785743713
distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2662392556667328
distilbert.transformer.layer.2.attention.q_lin.lora.B 0.15666478872299194
distilbert.transformer.layer.2.attention.k_lin.lora.A 0.2635606527328491
distilbert.transformer.layer.2.attention.k_lin.lora.B 0.15871921181678772
distilbert.transformer.layer.2.attention.v_lin.lora.A 0.23818302154541016
distilbert.transformer.layer.2.attention.v_lin.lora.B 0.10619251430034637
distilbert.transformer.layer.2.attention.out_lin.lora.A 0.24991869926452637
distilbert.transformer.layer.2.attention.out_lin.lora.B 0.1336245983839035
distilbert.transformer.layer.2.ffn.lin1.lora.A 0.2741438150405884
distilbert.transformer.layer.2.ffn.lin1.lora.B 0.1599275767803192
distilbert.transformer.layer.2.ffn.lin2.lora.A 0.251351535320282
distilbert.transformer.layer.2.ffn.lin2.lora.B 0.12355349957942963
distilbert.transformer.layer.3.attention.q_lin.lora.A 0.25989094376564026
distilbert.transformer.layer.3.attention.q_lin.lora.B 0.1655236780643463
distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2773197889328003
distilbert.transformer.layer.3.attention.k_lin.lora.B 0.1714954972267151
distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2522783577442169
distilbert.transformer.layer.3.attention.v_lin.lora.B 0.12189989537000656
distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2552739977836609
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.1252676546573639
distilbert.transformer.layer.3.ffn.lin1.lora.A 0.24903497099876404
distilbert.transformer.layer.3.ffn.lin1.lora.B 0.14453813433647156
distilbert.transformer.layer.3.ffn.lin2.lora.A 0.24802027642726898
distilbert.transformer.layer.3.ffn.lin2.lora.B 0.11787533760070801
distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2526055574417114
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.16136936843395233
distilbert.transformer.layer.4.attention.k_lin.lora.A 0.25712794065475464
distilbert.transformer.layer.4.attention.k_lin.lora.B 0.1542537808418274
distilbert.transformer.layer.4.attention.v_lin.lora.A 0.24055878818035126
distilbert.transformer.layer.4.attention.v_lin.lora.B 0.11367492377758026
distilbert.transformer.layer.4.attention.out_lin.lora.A 0.25640028715133667
distilbert.transformer.layer.4.attention.out_lin.lora.B 0.11762168258428574
distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2542564272880554
distilbert.transformer.layer.4.ffn.lin1.lora.B 0.13327965140342712
distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2369489073753357
distilbert.transformer.layer.4.ffn.lin2.lora.B 0.10564354062080383
distilbert.transformer.layer.5.attention.q_lin.lora.A 0.25673991441726685
distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1569884866476059
distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2547406554222107
distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1376175880432129
distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24085545539855957
distilbert.transformer.layer.5.attention.v_lin.lora.B 0.09300228208303452
distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24862870573997498
distilbert.transformer.layer.5.attention.out_lin.lora.B 0.09722810983657837
distilbert.transformer.layer.5.ffn.lin1.lora.A 0.22779500484466553
distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1058943122625351
distilbert.transformer.layer.5.ffn.lin2.lora.A 0.22217977046966553
distilbert.transformer.layer.5.ffn.lin2.lora.B 0.09405812621116638
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 38.7124
distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 22.6398
distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 37.3590
distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 22.5018
distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 36.7891
distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 18.9860
distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 35.1135
distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 18.5258
distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 38.2489
distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 45.9634
distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 68.6745
distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 18.7223
distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 36.9099
distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 19.7729
distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 36.7043
distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 22.0986
distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 33.4483
distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 17.5916
distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 34.7149
distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 16.7974
distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 36.3546
distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 45.4047
distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 74.4587
distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 19.2996
distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 37.4187
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 21.8922
distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 36.9623
distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 22.2086
distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 33.3070
distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 14.8621
distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 35.3756
distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 18.9542
distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 38.8654
distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 45.5287
distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 71.2674
distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 17.6604
distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 36.5975
distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 23.2939
distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 39.3017
distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 24.1747
distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 35.6658
distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 17.2920
distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 36.1697
distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 17.6819
distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 35.1100
distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 40.8434
distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 70.7395
distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 16.8413
distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 35.6475
distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 22.5161
distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 36.2313
distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 21.6272
distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.1842
distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 16.1350
distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 36.4880
distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 16.7542
distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 35.7436
distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 37.9658
distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 66.8764
distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 15.3375
distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 36.0503
distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 22.1811
distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 36.0766
distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 19.7476
distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.1035
distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 13.0682
distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 35.0975
distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 13.7873
distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 31.9677
distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 30.2731
distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 62.3751
distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 13.9422
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.scale_out 2.0837180614471436
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9779415130615234
distilbert.transformer.layer.0.attention.k_lin.scale_out 2.062652111053467
distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9639686346054077
distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9488314390182495
distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9646772146224976
distilbert.transformer.layer.0.attention.out_lin.scale_out 1.909591555595398
distilbert.transformer.layer.0.attention.out_lin.scale_in 1.948154330253601
distilbert.transformer.layer.1.attention.q_lin.scale_out 2.0283894538879395
distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9835941791534424
distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9929611682891846
distilbert.transformer.layer.1.attention.k_lin.scale_in 1.965177297592163
distilbert.transformer.layer.1.attention.v_lin.scale_out 1.931127905845642
distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9647105932235718
distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9083741903305054
distilbert.transformer.layer.1.attention.out_lin.scale_in 1.95680832862854
distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0393614768981934
distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9570300579071045
distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0406978130340576
distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9585788249969482
distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9163254499435425
distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9447929859161377
distilbert.transformer.layer.2.attention.out_lin.scale_out 1.9148540496826172
distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9425309896469116
distilbert.transformer.layer.3.attention.q_lin.scale_out 2.020709991455078
distilbert.transformer.layer.3.attention.q_lin.scale_in 1.950387716293335
distilbert.transformer.layer.3.attention.k_lin.scale_out 2.052197217941284
distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9654889106750488
distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9081611633300781
distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9228265285491943
distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9347209930419922
distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9294712543487549
distilbert.transformer.layer.4.attention.q_lin.scale_out 2.056077480316162
distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9218626022338867
distilbert.transformer.layer.4.attention.k_lin.scale_out 2.069587230682373
distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9484262466430664
distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8914620876312256
distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9186089038848877
distilbert.transformer.layer.4.attention.out_lin.scale_out 1.923946738243103
distilbert.transformer.layer.4.attention.out_lin.scale_in 1.9129976034164429
distilbert.transformer.layer.5.attention.q_lin.scale_out 1.989418387413025
distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9430391788482666
distilbert.transformer.layer.5.attention.k_lin.scale_out 2.071225643157959
distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9252580404281616
distilbert.transformer.layer.5.attention.v_lin.scale_out 1.8846511840820312
distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9164375066757202
distilbert.transformer.layer.5.attention.out_lin.scale_out 1.880750060081482
distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9473273754119873
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.4084
distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.2926
distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.7943
distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8718
distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.4596
distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.8923
distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.3308
distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 54.3720
distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.6834
distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.3731
distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.7892
distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 54.8925
distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.8451
distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 54.7039
distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2424
distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.5817
distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 57.0762
distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.6677
distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.1677
distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 54.6647
distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.4624
distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 54.1592
distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.5978
distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 54.2466
distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.6097
distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4790
distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 57.5021
distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.9606
distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 53.3222
distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.7177
distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 54.1458
distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.9458
distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.6845
distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7307
distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.9948
distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.3578
distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 53.1103
distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 53.5967
distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 53.8903
distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 53.5606
distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.7921
distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 54.2675
distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.9446
distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.7487
distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.0761
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.5358
distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 52.6063
distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.3667
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.m_out 0.37365245819091797
distilbert.transformer.layer.0.attention.q_lin.m_in 0.2769436240196228
distilbert.transformer.layer.0.attention.k_lin.m_out 0.36940622329711914
distilbert.transformer.layer.0.attention.k_lin.m_in 0.2654905617237091
distilbert.transformer.layer.0.attention.v_lin.m_out 0.2665351629257202
distilbert.transformer.layer.0.attention.v_lin.m_in 0.25776490569114685
distilbert.transformer.layer.0.attention.out_lin.m_out 0.23288491368293762
distilbert.transformer.layer.0.attention.out_lin.m_in 0.23689687252044678
distilbert.transformer.layer.1.attention.q_lin.m_out 0.31158336997032166
distilbert.transformer.layer.1.attention.q_lin.m_in 0.2566670775413513
distilbert.transformer.layer.1.attention.k_lin.m_out 0.3114917278289795
distilbert.transformer.layer.1.attention.k_lin.m_in 0.25877076387405396
distilbert.transformer.layer.1.attention.v_lin.m_out 0.23386649787425995
distilbert.transformer.layer.1.attention.v_lin.m_in 0.24185800552368164
distilbert.transformer.layer.1.attention.out_lin.m_out 0.2300492525100708
distilbert.transformer.layer.1.attention.out_lin.m_in 0.23636046051979065
distilbert.transformer.layer.2.attention.q_lin.m_out 0.34007567167282104
distilbert.transformer.layer.2.attention.q_lin.m_in 0.2554404139518738
distilbert.transformer.layer.2.attention.k_lin.m_out 0.33773207664489746
distilbert.transformer.layer.2.attention.k_lin.m_in 0.2607421278953552
distilbert.transformer.layer.2.attention.v_lin.m_out 0.23071661591529846
distilbert.transformer.layer.2.attention.v_lin.m_in 0.2291429042816162
distilbert.transformer.layer.2.attention.out_lin.m_out 0.2561138868331909
distilbert.transformer.layer.2.attention.out_lin.m_in 0.23889867961406708
distilbert.transformer.layer.3.attention.q_lin.m_out 0.32864949107170105
distilbert.transformer.layer.3.attention.q_lin.m_in 0.2578366696834564
distilbert.transformer.layer.3.attention.k_lin.m_out 0.35121220350265503
distilbert.transformer.layer.3.attention.k_lin.m_in 0.2772657871246338
distilbert.transformer.layer.3.attention.v_lin.m_out 0.23328649997711182
distilbert.transformer.layer.3.attention.v_lin.m_in 0.24039456248283386
distilbert.transformer.layer.3.attention.out_lin.m_out 0.25510165095329285
distilbert.transformer.layer.3.attention.out_lin.m_in 0.2509150505065918
distilbert.transformer.layer.4.attention.q_lin.m_out 0.3528974950313568
distilbert.transformer.layer.4.attention.q_lin.m_in 0.23650650680065155
distilbert.transformer.layer.4.attention.k_lin.m_out 0.3678012490272522
distilbert.transformer.layer.4.attention.k_lin.m_in 0.24499206244945526
distilbert.transformer.layer.4.attention.v_lin.m_out 0.24949286878108978
distilbert.transformer.layer.4.attention.v_lin.m_in 0.2326044738292694
distilbert.transformer.layer.4.attention.out_lin.m_out 0.2516971230506897
distilbert.transformer.layer.4.attention.out_lin.m_in 0.22889690101146698
distilbert.transformer.layer.5.attention.q_lin.m_out 0.3036739230155945
distilbert.transformer.layer.5.attention.q_lin.m_in 0.251531720161438
distilbert.transformer.layer.5.attention.k_lin.m_out 0.3580213785171509
distilbert.transformer.layer.5.attention.k_lin.m_in 0.23563992977142334
distilbert.transformer.layer.5.attention.v_lin.m_out 0.25412318110466003
distilbert.transformer.layer.5.attention.v_lin.m_in 0.2263183444738388
distilbert.transformer.layer.5.attention.out_lin.m_out 0.20213589072227478
distilbert.transformer.layer.5.attention.out_lin.m_in 0.23632609844207764
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 12.5664
distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 9.8370
distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 12.4071
distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 9.4924
distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 9.3731
distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 9.3112
distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 8.2969
distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 8.8274
distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 10.7503
distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 9.1579
distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 10.8811
distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.3415
distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 8.3309
distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 8.7695
distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.3219
distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 8.7636
distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.6643
distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.1391
distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 11.6859
distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 9.3386
distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.4619
distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 8.3938
distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.1581
distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 8.8639
distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.3407
distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.3832
distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 12.0453
distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 9.8448
distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.5036
distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.1459
distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.1446
distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.5005
distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.2625
distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 8.7035
distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 12.5059
distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 8.8216
distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 9.0152
distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.0068
distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 8.9888
distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.0314
distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 10.8569
distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.2505
distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.0845
distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 8.9086
distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.4463
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 8.8336
distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 7.7024
distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.9938
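
To make the long per-parameter dump above easier to scan, the same statistics can be folded into one row per adapted module. A minimal sketch, assuming pandas is available in this environment; the regular expression simply matches the parameter names printed above, and FFN modules (which have no scale/m vectors) will show NaN in those columns.

# Hedged sketch: summarize the adapter statistics above into one row per module.
import re
import pandas as pd

rows = {}
for name, p in model_ddora_all_attn.named_parameters():
    # Match names such as "...q_lin.lora.A", "...q_lin.scale_out", "...q_lin.m_in"
    m = re.match(r"(.+)\.(lora\.[AB]|scale_(?:in|out)|m_(?:in|out))$", name)
    if m is None:
        continue
    module, part = m.groups()
    rows.setdefault(module, {})[f"{part} |.|"] = p.abs().mean().item()
    rows[module][f"{part} norm"] = p.norm().item()

summary = pd.DataFrame.from_dict(rows, orient="index").sort_index()
print(summary.round(4))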
In [ ]: