In [1]:
import torch
import sys
import gc
print(sys.version)
print(f"PyTorch Version: {torch.__version__}")
print(torch.cuda.is_available())
print(torch.cuda.device_count())

if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(torch.cuda.get_device_name(0))

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

import bitsandbytes
import peft
import transformers

print(transformers.__version__)

print(f"bitsandbytes version: {bitsandbytes.__version__}")
print(f"peft version: {peft.__version__}")
print(torch.cuda.is_bf16_supported())

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]
PyTorch Version: 2.5.1+cu121
True
1
CUDA Version: 12.1
NVIDIA GeForce RTX 4080 Laptop GPU
4.50.0.dev0
bitsandbytes version: 0.45.3
peft version: 0.15.2.dev0
True
In [2]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset = imdb_dataset.rename_column("label", "labels")
# Split the test set into validation and test sets
test_val_split = imdb_dataset['test'].train_test_split(test_size=0.95, seed=42)
imdb_dataset['validation'] = test_val_split['train']
imdb_dataset['test'] = test_val_split['test']

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# Determine the number of labels
num_labels = len(set(imdb_dataset["train"]["labels"]))
print(f"Number of labels: {num_labels}")

# Load the tokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize the whole dataset, truncate to 384 tokens
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=384)

dataset_encoded = imdb_dataset.map(tokenize, batched=True, batch_size=None)

# Load the pretrained model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))
#print(model)
Number of labels: 2
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [3]:
# Helper functions
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
    
def count_trainable_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params, 100 * trainable_params / total_params

def freeze_model_layers(model, unfreeze_pre_classifier=False):
    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze LoRA and DoRA-specific params, including lora_norm
    for name, param in model.named_parameters():
        if (
            "lora.A" in name
            or "lora.B" in name
            or "lora_norm" in name  
            or name.endswith(".m")   # For DoRA
            or name.endswith(".m_in") # For DDoRA
            or name.endswith(".m_out") # For DDoRA
            or "scale" in name
        ):
            param.requires_grad = True

    # Unfreeze classifier layer (always)
    for name, param in model.named_parameters():
        if name.startswith("classifier."):
            param.requires_grad = True

    # unfreeze pre-classifier
    if unfreeze_pre_classifier:
        for name, param in model.named_parameters():
            if name.startswith("pre_classifier."):
                param.requires_grad = True
In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.autograd.set_detect_anomaly(True)

class LoRALayer(nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha, dropout_rate=0.0):
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        self.B = nn.Parameter(1e-4 * torch.randn(rank, out_dim) * std_dev)  # not all zeroes!
        self.alpha = alpha
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        dropped = self.dropout(x @ self.A)
        return self.alpha * (dropped @ self.B)


class LinearWithDoubleDoRA(nn.Module):
    def __init__(self, linear, rank, alpha, scaling_factor=1.0):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())        
        self.m_out = nn.Parameter(torch.randn(1, linear.out_features) * std_dev)
        self.m_in = nn.Parameter(torch.randn(linear.in_features, 1) * std_dev)   
        # Orthogonal initialization for m_out
        #self.m_out = nn.Parameter(torch.empty(1, linear.out_features))
        #nn.init.orthogonal_(self.m_out)
        # Orthogonal initialization for m_in
        #self.m_in = nn.Parameter(torch.empty(linear.in_features, 1))
        #nn.init.orthogonal_(self.m_in)        
        self.scale_out = nn.Parameter(torch.full((1, linear.out_features), float(scaling_factor)))
        self.scale_in = nn.Parameter(torch.full((linear.in_features, 1), float(scaling_factor)))
        self.last_lora_output_norm = 0.0  # For monitoring

    def forward(self, x):
        scaled_x = x * self.scale_in.T * self.m_in.T
        linear_output = self.linear(x)
        lora_output = self.lora(scaled_x)
        lora_output = F.dropout(lora_output, p=0.05) #prevents overfitting to small artifacts, spreads useful signal across more of the low-rank space
        lora_output_norm = lora_output / (lora_output.norm(p=2, dim=1, keepdim=True) + 1e-9)
        self.last_lora_output_norm = lora_output.norm(p=2, dim=-1).mean().item()
        dora_modification = self.scale_out * self.m_out * lora_output_norm
        return linear_output + dora_modification



def inject_ddora_all_attn(model, rank, alpha, scaling_factor=1.0, dropout_rate=0.0, disable_layers=None):
    target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin", "ffn.lin1", "ffn.lin2"]
    #target_layers = ["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin"]
    if disable_layers is None:
        disable_layers = []

    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and any(layer in name for layer in target_layers):
            # Try to extract layer index from names like "transformer.layer.4.attention.q_lin"
            parts = name.split('.')
            layer_idx = None
            for i, part in enumerate(parts):
                if part == "layer" and i + 1 < len(parts):
                    try:
                        layer_idx = int(parts[i + 1])
                        break
                    except ValueError:
                        pass

            if layer_idx is not None and layer_idx in disable_layers:
                continue

            parent_name = name.rsplit('.', 1)[0]
            parent_module = model.get_submodule(parent_name)
            original_linear = getattr(parent_module, name.split('.')[-1])

            ddora_layer = LinearWithDoubleDoRA(original_linear, rank, alpha, scaling_factor)
            ddora_layer.lora.dropout = nn.Dropout(dropout_rate)

            setattr(parent_module, name.split('.')[-1], ddora_layer)

    return model
In [5]:
def monitor_lora_parameters(model, threshold=1e-7):
    monitor = {
        "A_abs_mean": [],
        "B_abs_mean": [],
        "A_grad_mean": [],
        "B_grad_mean": [],
        "lora_output_norm": [],
        "B_nonzero_count": [],
    }
    hooks = []

    for name, module in model.named_modules():
        if hasattr(module, "lora") and hasattr(module.lora, "A") and hasattr(module.lora, "B"):
            A_param = module.lora.A
            B_param = module.lora.B

            # Gradient hooks (directly on nn.Parameter)
            if A_param.requires_grad:
                hooks.append(A_param.register_hook(lambda grad, n=name: monitor["A_grad_mean"].append((n, grad.abs().mean().item()))))
            if B_param.requires_grad:
                hooks.append(B_param.register_hook(lambda grad, n=name: monitor["B_grad_mean"].append((n, grad.abs().mean().item()))))

            # Forward hook for value stats
            def forward_hook(mod, inp, out, n=name):
                A_mean = mod.lora.A.abs().mean().item()
                B_mean = mod.lora.B.abs().mean().item()
                B_nnz = (mod.lora.B.abs() > threshold).sum().item()
                monitor["A_abs_mean"].append((n, A_mean))
                monitor["B_abs_mean"].append((n, B_mean))
                monitor["B_nonzero_count"].append((n, B_nnz))
                monitor["lora_output_norm"].append((n, mod.last_lora_output_norm))

            hooks.append(module.register_forward_hook(forward_hook))

    return hooks, monitor

from transformers import TrainingArguments

def monitor_gradients(model):
    hooks = []
    gradient_history = {}

    for name, param in model.named_parameters():
        if param.requires_grad:
            gradient_history[name] = []

            def get_hook(n):  # capture the name immediately
                def hook(grad):
                    gradient_history[n].append(grad.abs().mean().item())
                return hook

            hooks.append(param.register_hook(get_hook(name)))
    return hooks, gradient_history
In [6]:
learning_rate = 1e-2 #############
dropout = 0.3 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
output_dir_prefix = "finetuned-imdb-"

import copy
torch.manual_seed(137)

model_ddora_all_attn = copy.deepcopy(model)
model_ddora_all_attn = inject_ddora_all_attn(model_ddora_all_attn, lora_rank, lora_alpha, scaling_factor, dropout)
freeze_model_layers(model_ddora_all_attn, unfreeze_pre_classifier=True)

total_params_ddora, trainable_params_ddora, percentage_ddora = count_trainable_parameters(model_ddora_all_attn)
print(f"\nDDoRA (All Attention) - Total parameters: {total_params_ddora:,}")
print(f"DDoRA (All Attention) - Trainable parameters: {trainable_params_ddora:,} ({percentage_ddora:.2f}%)")

# Sanity check
#print("\nTrainable parameters after freezing:")
#for name, param in model_ddora_all_attn.named_parameters():
#    if param.requires_grad:
#        print(name)

from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"

training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=3, 
    #max_steps=200,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0, #####
    report_to="none",
    log_level="error"
)

    
trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)


hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)

#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())

#for hook in hooks2:
#    hook.remove()

#for name, grads in gradient_history2.items():
#    print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")

for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}

# Example output
for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")

#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
DDoRA (All Attention) - Total parameters: 68,448,002
DDoRA (All Attention) - Trainable parameters: 2,085,122 (3.05%)
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
[2346/2346 4:14:13, Epoch 3/3]
Step Training Loss Validation Loss Accuracy F1
50 0.601500 0.292667 0.880000 0.880255
100 0.330500 0.289936 0.878400 0.878587
150 0.333400 0.285789 0.880800 0.881023
200 0.268200 0.264306 0.894400 0.894553
250 0.298500 0.240361 0.902400 0.902442
300 0.275700 0.234229 0.906400 0.906162
350 0.270200 0.235252 0.908000 0.907978
400 0.243100 0.264926 0.902400 0.902558
450 0.250700 0.300190 0.896000 0.894957
500 0.281500 0.248224 0.909600 0.909429
550 0.247700 0.252260 0.908000 0.907657
600 0.269500 0.243112 0.911200 0.910913
650 0.237300 0.225605 0.912000 0.911927
700 0.233200 0.235947 0.911200 0.910974
750 0.246900 0.296594 0.901600 0.900834
800 0.247200 0.267171 0.903200 0.902447
850 0.223600 0.224062 0.911200 0.911207
900 0.206000 0.235725 0.914400 0.914495
950 0.233100 0.234277 0.919200 0.919140
1000 0.203900 0.212509 0.919200 0.919154
1050 0.207700 0.219393 0.924000 0.923772
1100 0.202600 0.227999 0.924800 0.924583
1150 0.192400 0.203774 0.922400 0.922417
1200 0.202600 0.218544 0.919200 0.919271
1250 0.197800 0.211557 0.926400 0.926253
1300 0.212400 0.205286 0.920000 0.920093
1350 0.213300 0.200498 0.924000 0.924067
1400 0.205300 0.203650 0.928000 0.927953
1450 0.198100 0.205225 0.918400 0.918446
1500 0.200500 0.195481 0.920000 0.919873
1550 0.198500 0.207493 0.925600 0.925617
1600 0.167500 0.255866 0.918400 0.918495
1650 0.158800 0.212292 0.924000 0.923970
1700 0.168400 0.229653 0.914400 0.914528
1750 0.161200 0.244941 0.922400 0.922314
1800 0.169800 0.212388 0.924800 0.924738
1850 0.174600 0.217708 0.921600 0.921634
1900 0.146000 0.218510 0.923200 0.923200
1950 0.174400 0.204068 0.927200 0.927120
2000 0.163500 0.209466 0.913600 0.913740
2050 0.162600 0.200313 0.926400 0.926451
2100 0.169100 0.197281 0.928000 0.928021
2150 0.117800 0.217998 0.922400 0.922468
2200 0.153300 0.205229 0.931200 0.931131
2250 0.173100 0.197001 0.928000 0.927953
2300 0.157300 0.195606 0.927200 0.927194

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 564070 KiB |  12777 MiB | 196479 GiB | 196478 GiB |
|       from large pool | 546048 KiB |  12712 MiB | 195749 GiB | 195748 GiB |
|       from small pool |  18022 KiB |     67 MiB |    730 GiB |    730 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 564070 KiB |  12777 MiB | 196479 GiB | 196478 GiB |
|       from large pool | 546048 KiB |  12712 MiB | 195749 GiB | 195748 GiB |
|       from small pool |  18022 KiB |     67 MiB |    730 GiB |    730 GiB |
|---------------------------------------------------------------------------|
| Requested memory      | 561856 KiB |  12774 MiB | 196296 GiB | 196296 GiB |
|       from large pool | 543836 KiB |  12708 MiB | 195570 GiB | 195569 GiB |
|       from small pool |  18020 KiB |     67 MiB |    726 GiB |    726 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  13058 MiB |  13058 MiB |  13058 MiB |      0 B   |
|       from large pool |  12988 MiB |  12988 MiB |  12988 MiB |      0 B   |
|       from small pool |     70 MiB |     70 MiB |     70 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Non-releasable memory | 183450 KiB | 337456 KiB |  31846 GiB |  31846 GiB |
|       from large pool | 174848 KiB | 305536 KiB |  31061 GiB |  31061 GiB |
|       from small pool |   8602 KiB |  43066 KiB |    784 GiB |    784 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     868    |    1343    |   18126 K  |   18125 K  |
|       from large pool |      82    |     334    |    5063 K  |    5063 K  |
|       from small pool |     786    |    1235    |   13063 K  |   13062 K  |
|---------------------------------------------------------------------------|
| Active allocs         |     868    |    1343    |   18126 K  |   18125 K  |
|       from large pool |      82    |     334    |    5063 K  |    5063 K  |
|       from small pool |     786    |    1235    |   13063 K  |   13062 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     266    |     266    |     266    |       0    |
|       from large pool |     231    |     231    |     231    |       0    |
|       from small pool |      35    |      35    |      35    |       0    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      42    |      91    |    7717 K  |    7717 K  |
|       from large pool |      19    |      25    |     975 K  |     975 K  |
|       from small pool |      23    |      73    |    6741 K  |    6741 K  |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

distilbert.transformer.layer.0.attention.q_lin: |A|=0.2712, |B|=0.1645, |∇A|=1.242e-05, |∇B|=1.544e-05, |LoRA(x)|=3.399e+04, B≠0=12288
distilbert.transformer.layer.0.attention.k_lin: |A|=0.2729, |B|=0.1675, |∇A|=6.177e-06, |∇B|=1.446e-05, |LoRA(x)|=3.671e+04, B≠0=12288
distilbert.transformer.layer.0.attention.v_lin: |A|=0.2637, |B|=0.1373, |∇A|=8.059e-06, |∇B|=2.125e-05, |LoRA(x)|=4.015e+04, B≠0=12288
distilbert.transformer.layer.0.attention.out_lin: |A|=0.2545, |B|=0.142, |∇A|=1.518e-05, |∇B|=4.65e-05, |LoRA(x)|=2.158e+04, B≠0=12288
distilbert.transformer.layer.0.ffn.lin1: |A|=0.2809, |B|=0.1687, |∇A|=2.046e-05, |∇B|=1.587e-05, |LoRA(x)|=8.516e+04, B≠0=49152
distilbert.transformer.layer.0.ffn.lin2: |A|=0.2528, |B|=0.1381, |∇A|=5.852e-06, |∇B|=4.107e-05, |LoRA(x)|=4.954e+04, B≠0=12288
distilbert.transformer.layer.1.attention.q_lin: |A|=0.2652, |B|=0.1512, |∇A|=1.248e-05, |∇B|=1.809e-05, |LoRA(x)|=2.561e+04, B≠0=12288
distilbert.transformer.layer.1.attention.k_lin: |A|=0.2559, |B|=0.1557, |∇A|=7.45e-06, |∇B|=1.748e-05, |LoRA(x)|=3.104e+04, B≠0=12288
distilbert.transformer.layer.1.attention.v_lin: |A|=0.2501, |B|=0.1275, |∇A|=1.187e-05, |∇B|=3.63e-05, |LoRA(x)|=3.725e+04, B≠0=12288
distilbert.transformer.layer.1.attention.out_lin: |A|=0.2552, |B|=0.1281, |∇A|=1.236e-05, |∇B|=5.365e-05, |LoRA(x)|=3.199e+04, B≠0=12288
distilbert.transformer.layer.1.ffn.lin1: |A|=0.2708, |B|=0.1609, |∇A|=1.571e-05, |∇B|=1.942e-05, |LoRA(x)|=9.91e+04, B≠0=49152
distilbert.transformer.layer.1.ffn.lin2: |A|=0.2588, |B|=0.1337, |∇A|=9.242e-06, |∇B|=6.657e-05, |LoRA(x)|=2.476e+04, B≠0=12288
distilbert.transformer.layer.2.attention.q_lin: |A|=0.2595, |B|=0.1515, |∇A|=1.11e-05, |∇B|=2.483e-05, |LoRA(x)|=3.157e+04, B≠0=12288
distilbert.transformer.layer.2.attention.k_lin: |A|=0.2708, |B|=0.1635, |∇A|=1.243e-05, |∇B|=2.197e-05, |LoRA(x)|=3.55e+04, B≠0=12288
distilbert.transformer.layer.2.attention.v_lin: |A|=0.2464, |B|=0.1088, |∇A|=8.384e-06, |∇B|=5.13e-05, |LoRA(x)|=4.268e+04, B≠0=12288
distilbert.transformer.layer.2.attention.out_lin: |A|=0.2503, |B|=0.1339, |∇A|=2.356e-05, |∇B|=6.333e-05, |LoRA(x)|=1.632e+04, B≠0=12288
distilbert.transformer.layer.2.ffn.lin1: |A|=0.283, |B|=0.1723, |∇A|=1.912e-05, |∇B|=2.311e-05, |LoRA(x)|=1.065e+05, B≠0=49152
distilbert.transformer.layer.2.ffn.lin2: |A|=0.2517, |B|=0.131, |∇A|=1.063e-05, |∇B|=7.487e-05, |LoRA(x)|=3.153e+04, B≠0=12288
distilbert.transformer.layer.3.attention.q_lin: |A|=0.2595, |B|=0.1643, |∇A|=1.092e-05, |∇B|=2.535e-05, |LoRA(x)|=3.189e+04, B≠0=12288
distilbert.transformer.layer.3.attention.k_lin: |A|=0.2756, |B|=0.1721, |∇A|=2.031e-05, |∇B|=2.696e-05, |LoRA(x)|=3.227e+04, B≠0=12288
distilbert.transformer.layer.3.attention.v_lin: |A|=0.2468, |B|=0.1189, |∇A|=1.566e-05, |∇B|=4.28e-05, |LoRA(x)|=2.765e+04, B≠0=12288
distilbert.transformer.layer.3.attention.out_lin: |A|=0.251, |B|=0.126, |∇A|=3.775e-05, |∇B|=6.053e-05, |LoRA(x)|=1.529e+04, B≠0=12288
distilbert.transformer.layer.3.ffn.lin1: |A|=0.2486, |B|=0.1463, |∇A|=2.647e-05, |∇B|=2.87e-05, |LoRA(x)|=6.15e+04, B≠0=49152
distilbert.transformer.layer.3.ffn.lin2: |A|=0.2441, |B|=0.1165, |∇A|=8.327e-06, |∇B|=7.226e-05, |LoRA(x)|=3.691e+04, B≠0=12288
distilbert.transformer.layer.4.attention.q_lin: |A|=0.25, |B|=0.1638, |∇A|=1.185e-05, |∇B|=3.286e-05, |LoRA(x)|=3.044e+04, B≠0=12288
distilbert.transformer.layer.4.attention.k_lin: |A|=0.2585, |B|=0.1636, |∇A|=2.955e-05, |∇B|=2.798e-05, |LoRA(x)|=3.135e+04, B≠0=12288
distilbert.transformer.layer.4.attention.v_lin: |A|=0.2335, |B|=0.1084, |∇A|=7.934e-06, |∇B|=2.614e-05, |LoRA(x)|=3.104e+04, B≠0=12288
distilbert.transformer.layer.4.attention.out_lin: |A|=0.2523, |B|=0.1177, |∇A|=3.329e-05, |∇B|=5.98e-05, |LoRA(x)|=1.712e+04, B≠0=12288
distilbert.transformer.layer.4.ffn.lin1: |A|=0.251, |B|=0.1328, |∇A|=1.922e-05, |∇B|=1.731e-05, |LoRA(x)|=6.409e+04, B≠0=49152
distilbert.transformer.layer.4.ffn.lin2: |A|=0.2312, |B|=0.102, |∇A|=1.664e-06, |∇B|=5.231e-05, |LoRA(x)|=9.2e+04, B≠0=12288
distilbert.transformer.layer.5.attention.q_lin: |A|=0.2522, |B|=0.1596, |∇A|=6.446e-06, |∇B|=2.722e-05, |LoRA(x)|=5.28e+04, B≠0=12288
distilbert.transformer.layer.5.attention.k_lin: |A|=0.2534, |B|=0.1391, |∇A|=2.139e-05, |∇B|=1.514e-05, |LoRA(x)|=3.592e+04, B≠0=12288
distilbert.transformer.layer.5.attention.v_lin: |A|=0.2337, |B|=0.08995, |∇A|=4.012e-06, |∇B|=1.909e-05, |LoRA(x)|=3.959e+04, B≠0=12288
distilbert.transformer.layer.5.attention.out_lin: |A|=0.2354, |B|=0.09846, |∇A|=1.088e-05, |∇B|=6.078e-05, |LoRA(x)|=1.886e+04, B≠0=12288
distilbert.transformer.layer.5.ffn.lin1: |A|=0.2339, |B|=0.1149, |∇A|=8.083e-06, |∇B|=9.786e-06, |LoRA(x)|=5.218e+04, B≠0=49152
distilbert.transformer.layer.5.ffn.lin2: |A|=0.2258, |B|=0.09958, |∇A|=8.421e-07, |∇B|=4.586e-05, |LoRA(x)|=1.574e+05, B≠0=12288
In [7]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.2852030396461487
distilbert.transformer.layer.0.attention.q_lin.lora.B 0.1868315190076828
distilbert.transformer.layer.0.attention.k_lin.lora.A 0.28894203901290894
distilbert.transformer.layer.0.attention.k_lin.lora.B 0.19284531474113464
distilbert.transformer.layer.0.attention.v_lin.lora.A 0.2786567807197571
distilbert.transformer.layer.0.attention.v_lin.lora.B 0.15678970515727997
distilbert.transformer.layer.0.attention.out_lin.lora.A 0.26653891801834106
distilbert.transformer.layer.0.attention.out_lin.lora.B 0.16393844783306122
distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3024210035800934
distilbert.transformer.layer.0.ffn.lin1.lora.B 0.19977959990501404
distilbert.transformer.layer.0.ffn.lin2.lora.A 0.266163170337677
distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16271300613880157
distilbert.transformer.layer.1.attention.q_lin.lora.A 0.28112292289733887
distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17357999086380005
distilbert.transformer.layer.1.attention.k_lin.lora.A 0.26969343423843384
distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17819923162460327
distilbert.transformer.layer.1.attention.v_lin.lora.A 0.26163023710250854
distilbert.transformer.layer.1.attention.v_lin.lora.B 0.14616072177886963
distilbert.transformer.layer.1.attention.out_lin.lora.A 0.26657772064208984
distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14856967329978943
distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2862350344657898
distilbert.transformer.layer.1.ffn.lin1.lora.B 0.1856018602848053
distilbert.transformer.layer.1.ffn.lin2.lora.A 0.2755448818206787
distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15843509137630463
distilbert.transformer.layer.2.attention.q_lin.lora.A 0.27186107635498047
distilbert.transformer.layer.2.attention.q_lin.lora.B 0.17322131991386414
distilbert.transformer.layer.2.attention.k_lin.lora.A 0.285927951335907
distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1866149604320526
distilbert.transformer.layer.2.attention.v_lin.lora.A 0.25501346588134766
distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12291982769966125
distilbert.transformer.layer.2.attention.out_lin.lora.A 0.2613106966018677
distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15331488847732544
distilbert.transformer.layer.2.ffn.lin1.lora.A 0.3022487163543701
distilbert.transformer.layer.2.ffn.lin1.lora.B 0.20070664584636688
distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2643374800682068
distilbert.transformer.layer.2.ffn.lin2.lora.B 0.1518675535917282
distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2738485038280487
distilbert.transformer.layer.3.attention.q_lin.lora.B 0.1887315809726715
distilbert.transformer.layer.3.attention.k_lin.lora.A 0.29151996970176697
distilbert.transformer.layer.3.attention.k_lin.lora.B 0.196702241897583
distilbert.transformer.layer.3.attention.v_lin.lora.A 0.25722846388816833
distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13603520393371582
distilbert.transformer.layer.3.attention.out_lin.lora.A 0.260562539100647
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.1413680464029312
distilbert.transformer.layer.3.ffn.lin1.lora.A 0.25821197032928467
distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16667717695236206
distilbert.transformer.layer.3.ffn.lin2.lora.A 0.25490492582321167
distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13568715751171112
distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2612036466598511
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.186976820230484
distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2723938226699829
distilbert.transformer.layer.4.attention.k_lin.lora.B 0.18866805732250214
distilbert.transformer.layer.4.attention.v_lin.lora.A 0.24070772528648376
distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12169525027275085
distilbert.transformer.layer.4.attention.out_lin.lora.A 0.26227468252182007
distilbert.transformer.layer.4.attention.out_lin.lora.B 0.1335488259792328
distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2600998878479004
distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15008249878883362
distilbert.transformer.layer.4.ffn.lin2.lora.A 0.2391623854637146
distilbert.transformer.layer.4.ffn.lin2.lora.B 0.11444054543972015
distilbert.transformer.layer.5.attention.q_lin.lora.A 0.2612508535385132
distilbert.transformer.layer.5.attention.q_lin.lora.B 0.18396827578544617
distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2618298828601837
distilbert.transformer.layer.5.attention.k_lin.lora.B 0.15817555785179138
distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24128463864326477
distilbert.transformer.layer.5.attention.v_lin.lora.B 0.10050331801176071
distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24433913826942444
distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11103056371212006
distilbert.transformer.layer.5.ffn.lin1.lora.A 0.24154244363307953
distilbert.transformer.layer.5.ffn.lin1.lora.B 0.13013333082199097
distilbert.transformer.layer.5.ffn.lin2.lora.A 0.23339377343654633
distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11259599775075912
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 40.3497
distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 26.3571
distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 40.8116
distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 27.0836
distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 39.6761
distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 22.3207
distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.0872
distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 23.2521
distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.0952
distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 56.6016
distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 76.5906
distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 23.2036
distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 40.0001
distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 24.6236
distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 38.1599
distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 25.0324
distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 37.2031
distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 20.7763
distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.0907
distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 21.1833
distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 40.4288
distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 52.5808
distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 79.1650
distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 22.6503
distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 38.5464
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 24.4745
distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 40.4875
distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 26.2999
distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 36.3883
distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 17.6219
distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 37.3501
distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 21.8545
distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 42.8315
distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 57.1244
distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 76.1303
distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 21.8819
distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 38.9474
distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 26.5128
distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 41.3343
distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 27.6666
distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 36.6672
distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 19.5756
distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 37.1686
distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 20.1348
distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 36.6448
distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 47.2993
distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 73.4788
distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 19.4280
distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 37.0708
distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 26.4088
distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 38.6041
distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 26.7302
distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.2148
distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 17.3467
distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 37.7528
distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 19.2596
distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 36.7303
distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 42.6779
distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 67.3745
distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 16.6143
distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 37.2654
distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 26.0703
distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 37.1617
distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 22.8125
distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.3778
distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 14.4962
distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.5389
distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 16.0226
distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 34.4220
distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 38.0077
distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 65.3166
distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 16.5074
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.scale_out 2.065723419189453
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9603335857391357
distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0488741397857666
distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9554517269134521
distilbert.transformer.layer.0.attention.v_lin.scale_out 1.924666166305542
distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9596352577209473
distilbert.transformer.layer.0.attention.out_lin.scale_out 1.8989819288253784
distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9143486022949219
distilbert.transformer.layer.1.attention.q_lin.scale_out 1.9937138557434082
distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9632105827331543
distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9829158782958984
distilbert.transformer.layer.1.attention.k_lin.scale_in 1.972657561302185
distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9284312725067139
distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9239709377288818
distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9023430347442627
distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9340471029281616
distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0029258728027344
distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9627807140350342
distilbert.transformer.layer.2.attention.k_lin.scale_out 2.041708469390869
distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9827865362167358
distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9156968593597412
distilbert.transformer.layer.2.attention.v_lin.scale_in 1.923585295677185
distilbert.transformer.layer.2.attention.out_lin.scale_out 1.894546389579773
distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9101351499557495
distilbert.transformer.layer.3.attention.q_lin.scale_out 2.0140867233276367
distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9457342624664307
distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0678651332855225
distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9433894157409668
distilbert.transformer.layer.3.attention.v_lin.scale_out 1.932185411453247
distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9020198583602905
distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9203736782073975
distilbert.transformer.layer.3.attention.out_lin.scale_in 1.919033408164978
distilbert.transformer.layer.4.attention.q_lin.scale_out 2.0426063537597656
distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9197235107421875
distilbert.transformer.layer.4.attention.k_lin.scale_out 2.0591018199920654
distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9308593273162842
distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8353819847106934
distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9478187561035156
distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9222724437713623
distilbert.transformer.layer.4.attention.out_lin.scale_in 1.8530683517456055
distilbert.transformer.layer.5.attention.q_lin.scale_out 1.9434971809387207
distilbert.transformer.layer.5.attention.q_lin.scale_in 1.901269793510437
distilbert.transformer.layer.5.attention.k_lin.scale_out 2.065319061279297
distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9098787307739258
distilbert.transformer.layer.5.attention.v_lin.scale_out 1.9001773595809937
distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9044647216796875
distilbert.transformer.layer.5.attention.out_lin.scale_out 1.8900084495544434
distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9580339193344116
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.1630
distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 54.9237
distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.7412
distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8128
distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.0365
distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.9269
distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.2719
distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 53.7150
distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.0822
distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.0276
distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.6968
distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.1462
distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.9471
distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8715
distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2138
distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.2321
distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.1507
distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.9045
distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.3555
distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.5349
distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.5577
distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 53.8781
distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.1729
distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.5444
distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.5551
distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4495
distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.1008
distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.5170
distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 54.0951
distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.2702
distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.8018
distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.7043
distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.4434
distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7162
distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 57.9523
distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.0160
distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.7578
distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.3412
distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.1367
distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 52.1710
distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 54.9056
distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.3334
distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.9395
distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.4642
distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.4265
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.3221
distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.2530
distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5957
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.m_out 0.3770124912261963
distilbert.transformer.layer.0.attention.q_lin.m_in 0.27724745869636536
distilbert.transformer.layer.0.attention.k_lin.m_out 0.37355756759643555
distilbert.transformer.layer.0.attention.k_lin.m_in 0.2839737832546234
distilbert.transformer.layer.0.attention.v_lin.m_out 0.2701805531978607
distilbert.transformer.layer.0.attention.v_lin.m_in 0.27773016691207886
distilbert.transformer.layer.0.attention.out_lin.m_out 0.24917708337306976
distilbert.transformer.layer.0.attention.out_lin.m_in 0.24767015874385834
distilbert.transformer.layer.1.attention.q_lin.m_out 0.3190837800502777
distilbert.transformer.layer.1.attention.q_lin.m_in 0.27320823073387146
distilbert.transformer.layer.1.attention.k_lin.m_out 0.3182787597179413
distilbert.transformer.layer.1.attention.k_lin.m_in 0.2656557559967041
distilbert.transformer.layer.1.attention.v_lin.m_out 0.25530683994293213
distilbert.transformer.layer.1.attention.v_lin.m_in 0.25207918882369995
distilbert.transformer.layer.1.attention.out_lin.m_out 0.24881324172019958
distilbert.transformer.layer.1.attention.out_lin.m_in 0.2508460581302643
distilbert.transformer.layer.2.attention.q_lin.m_out 0.3128194510936737
distilbert.transformer.layer.2.attention.q_lin.m_in 0.258196085691452
distilbert.transformer.layer.2.attention.k_lin.m_out 0.35070037841796875
distilbert.transformer.layer.2.attention.k_lin.m_in 0.2925625741481781
distilbert.transformer.layer.2.attention.v_lin.m_out 0.2368810623884201
distilbert.transformer.layer.2.attention.v_lin.m_in 0.24671325087547302
distilbert.transformer.layer.2.attention.out_lin.m_out 0.25679904222488403
distilbert.transformer.layer.2.attention.out_lin.m_in 0.23690304160118103
distilbert.transformer.layer.3.attention.q_lin.m_out 0.3336006999015808
distilbert.transformer.layer.3.attention.q_lin.m_in 0.26909562945365906
distilbert.transformer.layer.3.attention.k_lin.m_out 0.3726501166820526
distilbert.transformer.layer.3.attention.k_lin.m_in 0.28335466980934143
distilbert.transformer.layer.3.attention.v_lin.m_out 0.2500506043434143
distilbert.transformer.layer.3.attention.v_lin.m_in 0.23332524299621582
distilbert.transformer.layer.3.attention.out_lin.m_out 0.25282174348831177
distilbert.transformer.layer.3.attention.out_lin.m_in 0.24877238273620605
distilbert.transformer.layer.4.attention.q_lin.m_out 0.35002803802490234
distilbert.transformer.layer.4.attention.q_lin.m_in 0.24191546440124512
distilbert.transformer.layer.4.attention.k_lin.m_out 0.37940168380737305
distilbert.transformer.layer.4.attention.k_lin.m_in 0.25263217091560364
distilbert.transformer.layer.4.attention.v_lin.m_out 0.22971975803375244
distilbert.transformer.layer.4.attention.v_lin.m_in 0.24351906776428223
distilbert.transformer.layer.4.attention.out_lin.m_out 0.27787744998931885
distilbert.transformer.layer.4.attention.out_lin.m_in 0.2244793176651001
distilbert.transformer.layer.5.attention.q_lin.m_out 0.31838345527648926
distilbert.transformer.layer.5.attention.q_lin.m_in 0.24490538239479065
distilbert.transformer.layer.5.attention.k_lin.m_out 0.3695001006126404
distilbert.transformer.layer.5.attention.k_lin.m_in 0.24230703711509705
distilbert.transformer.layer.5.attention.v_lin.m_out 0.25600332021713257
distilbert.transformer.layer.5.attention.v_lin.m_in 0.22535942494869232
distilbert.transformer.layer.5.attention.out_lin.m_out 0.2510755956172943
distilbert.transformer.layer.5.attention.out_lin.m_in 0.22795787453651428
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 13.0632
distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 10.2798
distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 13.1138
distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.3424
distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 9.8511
distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.1935
distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.2815
distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.7289
distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 11.4923
distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.2457
distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 11.2975
distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.5561
distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 9.1756
distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.6442
distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.8119
distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.8755
distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.0380
distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.4882
distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 12.1821
distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 10.5475
distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.7905
distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.6471
distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.2760
distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.2415
distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.6200
distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.7229
distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 12.7838
distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.3737
distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 9.0698
distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.1958
distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.0340
distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.6839
distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.5137
distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.0286
distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.1788
distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.3409
distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.5298
distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.0859
distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.1835
distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.2817
distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.6623
distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.4598
distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.5437
distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.0907
distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.3540
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.0294
distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 9.3183
distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.3993
In [8]:
def set_all_lora_dropout(model, new_dropout_rate):
    for module in model.modules():
        if isinstance(module, LoRALayer):
            module.dropout.p = new_dropout_rate

def print_dropout_rates(model):
    for name, module in model.named_modules():
        if isinstance(module, LoRALayer):
            print(f"{name}.dropout.p = {module.dropout.p}")

def split_lora_dora_params(model):
    lora_A_params = []
    lora_B_params = []
    m_params = []
    scale_params = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "lora.A" in name:
            lora_A_params.append(param)
        elif "lora.B" in name:
            lora_B_params.append(param)
        elif name.endswith("m_in") or name.endswith("m_out"):
            m_params.append(param)
        elif "scale" in name:
            scale_params.append(param)

    return {
        "lora_A": lora_A_params,
        "lora_B": lora_B_params,
        "m": m_params,
        "scale": scale_params,
    }

def create_custom_optimizer(model, base_lr=1e-4, lr_B_scale=10.0, lr_scale_params=0.2, weight_decay=0.01):
    param_groups = split_lora_dora_params(model)

    optimizer = torch.optim.AdamW([
        {"params": param_groups["lora_A"], "lr": base_lr},
        {"params": param_groups["lora_B"], "lr": base_lr * lr_B_scale},
        {"params": param_groups["m"], "lr": base_lr},
        {"params": param_groups["scale"], "lr": base_lr * lr_scale_params},
    ], weight_decay=weight_decay)

    return optimizer
In [9]:
# set dropout to 0.1 to avoid overheating lora.B channel
# but: avoid dropout = 0.0 when lora.B is already large enough: this leads to LoRA adaptors dying out
#likely due to unregularised overfitting and gradient collapse on low-magnitude params
set_all_lora_dropout(model_ddora_all_attn, 0.1)
print_dropout_rates(model_ddora_all_attn)

dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 3e-3 ###############



from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"

training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=2, 
    #max_steps=200,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,
    report_to="none",
    log_level="error"
)

trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)

trainer_ddora_all_attn.optimizer = create_custom_optimizer(
    trainer_ddora_all_attn.model,
    base_lr=3e-3, ###########
    lr_B_scale=0.5, #############
    lr_scale_params=0.75, #########
    weight_decay=1e-5,
)


hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)

#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())

#for hook in hooks2:
#    hook.remove()

#for name, grads in gradient_history2.items():
#    print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")

for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}

# Example output
for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'][name]:.4g}, |∇B|={agg['B_grad_mean'][name]:.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'][name]:.4g}, B≠0={agg['B_nonzero_count'][name]:.0f}")

#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
distilbert.transformer.layer.0.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.0.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.0.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.1.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.1.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.2.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.2.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.3.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.3.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.4.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.4.ffn.lin2.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.q_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.k_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.v_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.attention.out_lin.lora.dropout.p = 0.1
distilbert.transformer.layer.5.ffn.lin1.lora.dropout.p = 0.1
distilbert.transformer.layer.5.ffn.lin2.lora.dropout.p = 0.1
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
[1564/1564 2:55:10, Epoch 2/2]
Step Training Loss Validation Loss Accuracy F1
50 0.171000 0.198666 0.928800 0.928650
100 0.152800 0.207403 0.928000 0.927965
150 0.170100 0.206528 0.922400 0.922406
200 0.111000 0.218847 0.927200 0.926929
250 0.139300 0.210196 0.928000 0.927940
300 0.142700 0.200031 0.929600 0.929578
350 0.144000 0.204206 0.924800 0.924776
400 0.117900 0.217714 0.930400 0.930310
450 0.125400 0.224378 0.924800 0.924822
500 0.148100 0.212716 0.926400 0.926400
550 0.125000 0.210286 0.928800 0.928650
600 0.129100 0.214049 0.929600 0.929589
650 0.102500 0.222723 0.927200 0.927171
700 0.102100 0.226605 0.928000 0.927927
750 0.111200 0.229735 0.925600 0.925606
800 0.124000 0.234266 0.919200 0.919298
850 0.098500 0.229973 0.926400 0.926400
900 0.085800 0.230217 0.924800 0.924650
950 0.090300 0.235953 0.928000 0.927927
1000 0.081900 0.242805 0.920800 0.920794
1050 0.096500 0.238420 0.926400 0.926237
1100 0.101600 0.233878 0.927200 0.927133
1150 0.066800 0.236318 0.929600 0.929529
1200 0.091300 0.236734 0.925600 0.925594
1250 0.077800 0.235389 0.928000 0.927953
1300 0.083900 0.239438 0.928800 0.928760
1350 0.089700 0.237017 0.927200 0.927092
1400 0.098800 0.233933 0.930400 0.930336
1450 0.072900 0.235942 0.927200 0.927194
1500 0.096200 0.238092 0.929600 0.929578
1550 0.171300 0.236920 0.928000 0.927965

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 559443 KiB |  12777 MiB | 327719 GiB | 327718 GiB |
|       from large pool | 541440 KiB |  12712 MiB | 326500 GiB | 326500 GiB |
|       from small pool |  18003 KiB |     67 MiB |   1218 GiB |   1218 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 559443 KiB |  12777 MiB | 327719 GiB | 327718 GiB |
|       from large pool | 541440 KiB |  12712 MiB | 326500 GiB | 326500 GiB |
|       from small pool |  18003 KiB |     67 MiB |   1218 GiB |   1218 GiB |
|---------------------------------------------------------------------------|
| Requested memory      | 557230 KiB |  12774 MiB | 327414 GiB | 327413 GiB |
|       from large pool | 539228 KiB |  12708 MiB | 326202 GiB | 326201 GiB |
|       from small pool |  18002 KiB |     67 MiB |   1212 GiB |   1212 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  13058 MiB |  13058 MiB |  25404 MiB |  12346 MiB |
|       from large pool |  12988 MiB |  12988 MiB |  25272 MiB |  12284 MiB |
|       from small pool |     70 MiB |     70 MiB |    132 MiB |     62 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 186029 KiB | 403075 KiB |  54058 GiB |  54058 GiB |
|       from large pool | 179456 KiB | 396800 KiB |  52747 GiB |  52747 GiB |
|       from small pool |   6573 KiB |  43066 KiB |   1311 GiB |   1311 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     860    |    1343    |   30218 K  |   30217 K  |
|       from large pool |      80    |     334    |    8442 K  |    8442 K  |
|       from small pool |     780    |    1235    |   21775 K  |   21775 K  |
|---------------------------------------------------------------------------|
| Active allocs         |     860    |    1343    |   30218 K  |   30217 K  |
|       from large pool |      80    |     334    |    8442 K  |    8442 K  |
|       from small pool |     780    |    1235    |   21775 K  |   21775 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     263    |     266    |     503    |     240    |
|       from large pool |     228    |     231    |     437    |     209    |
|       from small pool |      35    |      35    |      66    |      31    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      39    |      91    |   13123 K  |   13123 K  |
|       from large pool |      19    |      25    |    1645 K  |    1645 K  |
|       from small pool |      20    |      73    |   11477 K  |   11477 K  |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

distilbert.transformer.layer.0.attention.q_lin: |A|=0.2923, |B|=0.1886, |∇A|=1.603e-05, |∇B|=1.49e-05, |LoRA(x)|=3.698e+04, B≠0=12288
distilbert.transformer.layer.0.attention.k_lin: |A|=0.295, |B|=0.1946, |∇A|=8.035e-06, |∇B|=1.447e-05, |LoRA(x)|=4.196e+04, B≠0=12288
distilbert.transformer.layer.0.attention.v_lin: |A|=0.2844, |B|=0.1591, |∇A|=1.076e-05, |∇B|=1.945e-05, |LoRA(x)|=4.082e+04, B≠0=12288
distilbert.transformer.layer.0.attention.out_lin: |A|=0.271, |B|=0.166, |∇A|=1.771e-05, |∇B|=3.743e-05, |LoRA(x)|=2.424e+04, B≠0=12288
distilbert.transformer.layer.0.ffn.lin1: |A|=0.3087, |B|=0.2015, |∇A|=2.777e-05, |∇B|=1.693e-05, |LoRA(x)|=9.04e+04, B≠0=49152
distilbert.transformer.layer.0.ffn.lin2: |A|=0.2703, |B|=0.1648, |∇A|=8.989e-06, |∇B|=4.098e-05, |LoRA(x)|=4.44e+04, B≠0=12288
distilbert.transformer.layer.1.attention.q_lin: |A|=0.2863, |B|=0.1758, |∇A|=1.646e-05, |∇B|=1.71e-05, |LoRA(x)|=2.87e+04, B≠0=12288
distilbert.transformer.layer.1.attention.k_lin: |A|=0.2757, |B|=0.1802, |∇A|=9.736e-06, |∇B|=1.758e-05, |LoRA(x)|=3.28e+04, B≠0=12288
distilbert.transformer.layer.1.attention.v_lin: |A|=0.2666, |B|=0.1485, |∇A|=1.34e-05, |∇B|=2.776e-05, |LoRA(x)|=4.203e+04, B≠0=12288
distilbert.transformer.layer.1.attention.out_lin: |A|=0.2704, |B|=0.1508, |∇A|=1.454e-05, |∇B|=3.814e-05, |LoRA(x)|=3.212e+04, B≠0=12288
distilbert.transformer.layer.1.ffn.lin1: |A|=0.2926, |B|=0.1876, |∇A|=2.232e-05, |∇B|=2.009e-05, |LoRA(x)|=9.228e+04, B≠0=49152
distilbert.transformer.layer.1.ffn.lin2: |A|=0.2796, |B|=0.1607, |∇A|=1.426e-05, |∇B|=6.012e-05, |LoRA(x)|=2.112e+04, B≠0=12288
distilbert.transformer.layer.2.attention.q_lin: |A|=0.2781, |B|=0.1749, |∇A|=1.316e-05, |∇B|=2.128e-05, |LoRA(x)|=3.467e+04, B≠0=12288
distilbert.transformer.layer.2.attention.k_lin: |A|=0.2924, |B|=0.1885, |∇A|=1.639e-05, |∇B|=2.285e-05, |LoRA(x)|=3.962e+04, B≠0=12288
distilbert.transformer.layer.2.attention.v_lin: |A|=0.2582, |B|=0.1257, |∇A|=7.738e-06, |∇B|=2.115e-05, |LoRA(x)|=5.115e+04, B≠0=12288
distilbert.transformer.layer.2.attention.out_lin: |A|=0.2642, |B|=0.1559, |∇A|=2.69e-05, |∇B|=4.541e-05, |LoRA(x)|=1.683e+04, B≠0=12288
distilbert.transformer.layer.2.ffn.lin1: |A|=0.3067, |B|=0.2024, |∇A|=2.343e-05, |∇B|=2.157e-05, |LoRA(x)|=1.145e+05, B≠0=49152
distilbert.transformer.layer.2.ffn.lin2: |A|=0.2683, |B|=0.1544, |∇A|=1.34e-05, |∇B|=5.394e-05, |LoRA(x)|=3.488e+04, B≠0=12288
distilbert.transformer.layer.3.attention.q_lin: |A|=0.2803, |B|=0.1901, |∇A|=1.35e-05, |∇B|=2.353e-05, |LoRA(x)|=3.44e+04, B≠0=12288
distilbert.transformer.layer.3.attention.k_lin: |A|=0.2952, |B|=0.1982, |∇A|=2.459e-05, |∇B|=2.6e-05, |LoRA(x)|=3.69e+04, B≠0=12288
distilbert.transformer.layer.3.attention.v_lin: |A|=0.2612, |B|=0.1387, |∇A|=1.571e-05, |∇B|=2.234e-05, |LoRA(x)|=3.344e+04, B≠0=12288
distilbert.transformer.layer.3.attention.out_lin: |A|=0.2658, |B|=0.1439, |∇A|=3.732e-05, |∇B|=3.537e-05, |LoRA(x)|=1.835e+04, B≠0=12288
distilbert.transformer.layer.3.ffn.lin1: |A|=0.2639, |B|=0.1687, |∇A|=2.518e-05, |∇B|=1.91e-05, |LoRA(x)|=6.809e+04, B≠0=49152
distilbert.transformer.layer.3.ffn.lin2: |A|=0.2576, |B|=0.139, |∇A|=1.165e-05, |∇B|=4.748e-05, |LoRA(x)|=3.457e+04, B≠0=12288
distilbert.transformer.layer.4.attention.q_lin: |A|=0.266, |B|=0.1887, |∇A|=1.028e-05, |∇B|=2.133e-05, |LoRA(x)|=3.756e+04, B≠0=12288
distilbert.transformer.layer.4.attention.k_lin: |A|=0.2772, |B|=0.1906, |∇A|=2.782e-05, |∇B|=2.17e-05, |LoRA(x)|=3.853e+04, B≠0=12288
distilbert.transformer.layer.4.attention.v_lin: |A|=0.2456, |B|=0.1248, |∇A|=7.548e-06, |∇B|=1.231e-05, |LoRA(x)|=3.302e+04, B≠0=12288
distilbert.transformer.layer.4.attention.out_lin: |A|=0.266, |B|=0.1366, |∇A|=3.234e-05, |∇B|=2.639e-05, |LoRA(x)|=2.125e+04, B≠0=12288
distilbert.transformer.layer.4.ffn.lin1: |A|=0.264, |B|=0.1525, |∇A|=1.45e-05, |∇B|=8.94e-06, |LoRA(x)|=8.223e+04, B≠0=49152
distilbert.transformer.layer.4.ffn.lin2: |A|=0.2397, |B|=0.1167, |∇A|=1.681e-06, |∇B|=1.305e-05, |LoRA(x)|=9.592e+04, B≠0=12288
distilbert.transformer.layer.5.attention.q_lin: |A|=0.2693, |B|=0.186, |∇A|=6.757e-06, |∇B|=1.821e-05, |LoRA(x)|=6e+04, B≠0=12288
distilbert.transformer.layer.5.attention.k_lin: |A|=0.2672, |B|=0.1606, |∇A|=2.107e-05, |∇B|=1.107e-05, |LoRA(x)|=4.379e+04, B≠0=12288
distilbert.transformer.layer.5.attention.v_lin: |A|=0.2468, |B|=0.1033, |∇A|=7.268e-06, |∇B|=8.619e-06, |LoRA(x)|=3.511e+04, B≠0=12288
distilbert.transformer.layer.5.attention.out_lin: |A|=0.2494, |B|=0.1147, |∇A|=1.182e-05, |∇B|=1.262e-05, |LoRA(x)|=2.164e+04, B≠0=12288
distilbert.transformer.layer.5.ffn.lin1: |A|=0.2487, |B|=0.1329, |∇A|=7.639e-06, |∇B|=3.873e-06, |LoRA(x)|=6.733e+04, B≠0=49152
distilbert.transformer.layer.5.ffn.lin2: |A|=0.2379, |B|=0.1144, |∇A|=2.809e-07, |∇B|=5.887e-06, |LoRA(x)|=2.086e+05, B≠0=12288
In [10]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.28593704104423523
distilbert.transformer.layer.0.attention.q_lin.lora.B 0.18714945018291473
distilbert.transformer.layer.0.attention.k_lin.lora.A 0.28985413908958435
distilbert.transformer.layer.0.attention.k_lin.lora.B 0.1930702030658722
distilbert.transformer.layer.0.attention.v_lin.lora.A 0.27944302558898926
distilbert.transformer.layer.0.attention.v_lin.lora.B 0.15703189373016357
distilbert.transformer.layer.0.attention.out_lin.lora.A 0.26731884479522705
distilbert.transformer.layer.0.attention.out_lin.lora.B 0.16420790553092957
distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3031120300292969
distilbert.transformer.layer.0.ffn.lin1.lora.B 0.20004507899284363
distilbert.transformer.layer.0.ffn.lin2.lora.A 0.26663610339164734
distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16295364499092102
distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2813974618911743
distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17370526492595673
distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2702440321445465
distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17850112915039062
distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2619485855102539
distilbert.transformer.layer.1.attention.v_lin.lora.B 0.1464267075061798
distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2668524980545044
distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14889678359031677
distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2867199778556824
distilbert.transformer.layer.1.ffn.lin1.lora.B 0.18584245443344116
distilbert.transformer.layer.1.ffn.lin2.lora.A 0.2757873833179474
distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15874330699443817
distilbert.transformer.layer.2.attention.q_lin.lora.A 0.27296027541160583
distilbert.transformer.layer.2.attention.q_lin.lora.B 0.1735691875219345
distilbert.transformer.layer.2.attention.k_lin.lora.A 0.28685471415519714
distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1868443489074707
distilbert.transformer.layer.2.attention.v_lin.lora.A 0.255046546459198
distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12340390682220459
distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26136600971221924
distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15360134840011597
distilbert.transformer.layer.2.ffn.lin1.lora.A 0.302565336227417
distilbert.transformer.layer.2.ffn.lin1.lora.B 0.2009706199169159
distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2644515633583069
distilbert.transformer.layer.2.ffn.lin2.lora.B 0.15223130583763123
distilbert.transformer.layer.3.attention.q_lin.lora.A 0.27478688955307007
distilbert.transformer.layer.3.attention.q_lin.lora.B 0.18895143270492554
distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2912011742591858
distilbert.transformer.layer.3.attention.k_lin.lora.B 0.19694802165031433
distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2578084468841553
distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13630230724811554
distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2611168622970581
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.14164264500141144
distilbert.transformer.layer.3.ffn.lin1.lora.A 0.2590485215187073
distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16692781448364258
distilbert.transformer.layer.3.ffn.lin2.lora.A 0.254573792219162
distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13625115156173706
distilbert.transformer.layer.4.attention.q_lin.lora.A 0.26176127791404724
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.187218576669693
distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2729148268699646
distilbert.transformer.layer.4.attention.k_lin.lora.B 0.1890524923801422
distilbert.transformer.layer.4.attention.v_lin.lora.A 0.24128566682338715
distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12205924838781357
distilbert.transformer.layer.4.attention.out_lin.lora.A 0.26241645216941833
distilbert.transformer.layer.4.attention.out_lin.lora.B 0.13388215005397797
distilbert.transformer.layer.4.ffn.lin1.lora.A 0.26028382778167725
distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15029317140579224
distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23894798755645752
distilbert.transformer.layer.4.ffn.lin2.lora.B 0.1147853285074234
distilbert.transformer.layer.5.attention.q_lin.lora.A 0.26214921474456787
distilbert.transformer.layer.5.attention.q_lin.lora.B 0.18426935374736786
distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2623964548110962
distilbert.transformer.layer.5.attention.k_lin.lora.B 0.15851348638534546
distilbert.transformer.layer.5.attention.v_lin.lora.A 0.2414218783378601
distilbert.transformer.layer.5.attention.v_lin.lora.B 0.10067234933376312
distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24519243836402893
distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11163420975208282
distilbert.transformer.layer.5.ffn.lin1.lora.A 0.24182814359664917
distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1305427998304367
distilbert.transformer.layer.5.ffn.lin2.lora.A 0.23493000864982605
distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11285355687141418
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 40.4405
distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 26.3895
distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 40.9122
distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 27.1172
distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 39.7739
distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 22.3608
distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.1675
distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 23.2920
distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.1837
distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 56.6638
distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 76.7346
distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 23.2397
distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 40.0452
distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 24.6523
distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 38.2249
distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 25.0732
distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 37.2578
distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 20.8114
distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.1229
distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 21.2260
distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 40.4913
distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 52.6494
distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 79.2518
distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 22.6957
distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 38.6635
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 24.5085
distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 40.5898
distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 26.3329
distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 36.4050
distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 17.6652
distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 37.3746
distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 21.8932
distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 42.8806
distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 57.1776
distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 76.1942
distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 21.9220
distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 39.0540
distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 26.5379
distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 41.3258
distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 27.7049
distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 36.7462
distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 19.6216
distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 37.2570
distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 20.1840
distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 36.7492
distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 47.3749
distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 73.4350
distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 19.4855
distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 37.1394
distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 26.4493
distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 38.6975
distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 26.7736
distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.3077
distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 17.4064
distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 37.7813
distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 19.2943
distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 36.7713
distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 42.7477
distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 67.3355
distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 16.6588
distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 37.3640
distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 26.1069
distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 37.2713
distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 22.8616
distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.4147
distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 14.5707
distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.6305
distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 16.0943
distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 34.4835
distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 38.1080
distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 65.7199
distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 16.5415
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.scale_out 2.0673470497131348
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.959618330001831
distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0504937171936035
distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9555171728134155
distilbert.transformer.layer.0.attention.v_lin.scale_out 1.9283709526062012
distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9596271514892578
distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9006755352020264
distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9129583835601807
distilbert.transformer.layer.1.attention.q_lin.scale_out 1.9956352710723877
distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9624356031417847
distilbert.transformer.layer.1.attention.k_lin.scale_out 1.984641432762146
distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9718208312988281
distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9293396472930908
distilbert.transformer.layer.1.attention.v_lin.scale_in 1.922268271446228
distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9018371105194092
distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9323279857635498
distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0033369064331055
distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9628219604492188
distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0424909591674805
distilbert.transformer.layer.2.attention.k_lin.scale_in 1.9834234714508057
distilbert.transformer.layer.2.attention.v_lin.scale_out 1.915880560874939
distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9204366207122803
distilbert.transformer.layer.2.attention.out_lin.scale_out 1.8952863216400146
distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9077247381210327
distilbert.transformer.layer.3.attention.q_lin.scale_out 2.016188144683838
distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9464267492294312
distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0708353519439697
distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9410731792449951
distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9289363622665405
distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9007830619812012
distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9172041416168213
distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9178813695907593
distilbert.transformer.layer.4.attention.q_lin.scale_out 2.042696714401245
distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9188411235809326
distilbert.transformer.layer.4.attention.k_lin.scale_out 2.06121826171875
distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9304380416870117
distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8320510387420654
distilbert.transformer.layer.4.attention.v_lin.scale_in 1.946484088897705
distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9181432723999023
distilbert.transformer.layer.4.attention.out_lin.scale_in 1.8494853973388672
distilbert.transformer.layer.5.attention.q_lin.scale_out 1.9441440105438232
distilbert.transformer.layer.5.attention.q_lin.scale_in 1.900758147239685
distilbert.transformer.layer.5.attention.k_lin.scale_out 2.066316843032837
distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9095886945724487
distilbert.transformer.layer.5.attention.v_lin.scale_out 1.896740198135376
distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9020140171051025
distilbert.transformer.layer.5.attention.out_lin.scale_out 1.887932300567627
distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9566638469696045
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.2229
distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 54.9106
distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 57.7934
distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8139
distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.1541
distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.9282
distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.3326
distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 53.6839
distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.1401
distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.0107
distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.7442
distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.1282
distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 53.9817
distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8315
distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2108
distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.1900
distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.1699
distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.9118
distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.3825
distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.5532
distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.5693
distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 53.8041
distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.1984
distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.4889
distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.6185
distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.4670
distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.1903
distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.4578
distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 54.0061
distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.2434
distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.7205
distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.6783
distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.4506
distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7003
distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 58.0178
distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.0089
distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.6836
distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.3096
distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.0278
distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 52.0867
distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 54.9359
distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.3308
distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 57.9806
distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.4626
distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.3404
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.2618
distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.2070
distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5645
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.m_out 0.38056397438049316
distilbert.transformer.layer.0.attention.q_lin.m_in 0.27801811695098877
distilbert.transformer.layer.0.attention.k_lin.m_out 0.37722188234329224
distilbert.transformer.layer.0.attention.k_lin.m_in 0.2856902778148651
distilbert.transformer.layer.0.attention.v_lin.m_out 0.2773345112800598
distilbert.transformer.layer.0.attention.v_lin.m_in 0.2796667814254761
distilbert.transformer.layer.0.attention.out_lin.m_out 0.2537873089313507
distilbert.transformer.layer.0.attention.out_lin.m_in 0.2489577978849411
distilbert.transformer.layer.1.attention.q_lin.m_out 0.323003351688385
distilbert.transformer.layer.1.attention.q_lin.m_in 0.2741820216178894
distilbert.transformer.layer.1.attention.k_lin.m_out 0.32190388441085815
distilbert.transformer.layer.1.attention.k_lin.m_in 0.26650986075401306
distilbert.transformer.layer.1.attention.v_lin.m_out 0.25884923338890076
distilbert.transformer.layer.1.attention.v_lin.m_in 0.25241196155548096
distilbert.transformer.layer.1.attention.out_lin.m_out 0.25029632449150085
distilbert.transformer.layer.1.attention.out_lin.m_in 0.2511628568172455
distilbert.transformer.layer.2.attention.q_lin.m_out 0.3147231936454773
distilbert.transformer.layer.2.attention.q_lin.m_in 0.26011890172958374
distilbert.transformer.layer.2.attention.k_lin.m_out 0.3528413772583008
distilbert.transformer.layer.2.attention.k_lin.m_in 0.29503440856933594
distilbert.transformer.layer.2.attention.v_lin.m_out 0.23936259746551514
distilbert.transformer.layer.2.attention.v_lin.m_in 0.2455853670835495
distilbert.transformer.layer.2.attention.out_lin.m_out 0.25981783866882324
distilbert.transformer.layer.2.attention.out_lin.m_in 0.23666052520275116
distilbert.transformer.layer.3.attention.q_lin.m_out 0.3375234305858612
distilbert.transformer.layer.3.attention.q_lin.m_in 0.2719265818595886
distilbert.transformer.layer.3.attention.k_lin.m_out 0.37764236330986023
distilbert.transformer.layer.3.attention.k_lin.m_in 0.281810462474823
distilbert.transformer.layer.3.attention.v_lin.m_out 0.24766501784324646
distilbert.transformer.layer.3.attention.v_lin.m_in 0.23446033895015717
distilbert.transformer.layer.3.attention.out_lin.m_out 0.2505345344543457
distilbert.transformer.layer.3.attention.out_lin.m_in 0.2496027946472168
distilbert.transformer.layer.4.attention.q_lin.m_out 0.3517087697982788
distilbert.transformer.layer.4.attention.q_lin.m_in 0.2426379770040512
distilbert.transformer.layer.4.attention.k_lin.m_out 0.3833864629268646
distilbert.transformer.layer.4.attention.k_lin.m_in 0.2542974054813385
distilbert.transformer.layer.4.attention.v_lin.m_out 0.22918450832366943
distilbert.transformer.layer.4.attention.v_lin.m_in 0.24464242160320282
distilbert.transformer.layer.4.attention.out_lin.m_out 0.2750687599182129
distilbert.transformer.layer.4.attention.out_lin.m_in 0.2238387167453766
distilbert.transformer.layer.5.attention.q_lin.m_out 0.32097992300987244
distilbert.transformer.layer.5.attention.q_lin.m_in 0.2463197410106659
distilbert.transformer.layer.5.attention.k_lin.m_out 0.37192124128341675
distilbert.transformer.layer.5.attention.k_lin.m_in 0.244779571890831
distilbert.transformer.layer.5.attention.v_lin.m_out 0.2550698518753052
distilbert.transformer.layer.5.attention.v_lin.m_in 0.22488240897655487
distilbert.transformer.layer.5.attention.out_lin.m_out 0.25121861696243286
distilbert.transformer.layer.5.attention.out_lin.m_in 0.2287808656692505
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 13.2019
distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 10.2984
distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 13.2189
distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.3540
distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 10.0525
distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.2157
distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.4275
distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.7476
distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 11.5930
distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.2563
distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 11.3709
distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.5824
distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 9.2795
distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.6517
distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.8824
distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.8791
distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.1133
distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.5387
distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 12.2395
distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 10.5863
distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.8512
distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.6610
distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.3322
distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.2583
distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.7188
distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.7568
distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 12.9247
distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.3524
distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 8.9878
distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.2346
distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 8.9857
distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.6994
distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.5542
distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.0776
distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.2873
distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.3746
distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.5224
distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.1099
distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.0823
distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.2891
distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.7517
distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.5089
distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.6481
distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.1404
distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.3037
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.0170
distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 9.3243
distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.4163

Freeze LoRA parameters for layer 5 FFN¶

In [11]:
# Freeze LoRA parameters for layer.5.ffn.lin2
for name, param in trainer_ddora_all_attn.model.named_parameters():#
    if "transformer.layer.5.ffn.lin2" in name and "lora" in name:
        param.requires_grad = False
        print(f"FROZEN: {name}")

dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 1e-4 ###############


from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"

training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=1, 
    #max_steps=200,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,
    report_to="none",
    log_level="error"
)

trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)

trainer_ddora_all_attn.optimizer = create_custom_optimizer(
    trainer_ddora_all_attn.model,
    base_lr=1e-3, ###########
    lr_B_scale=1.0, #############
    lr_scale_params=1.0, #########
    weight_decay=1e-5,
)


hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)

#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())

#for hook in hooks2:
#    hook.remove()

#for name, grads in gradient_history2.items():
#    print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")

for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}

for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'].get(name, 0.0):.4g}, |∇B|={agg['B_grad_mean'].get(name, 0.0):.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'].get(name, 0.0):.4g}, B≠0={agg['B_nonzero_count'].get(name, 0):.0f}")

#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
FROZEN: distilbert.transformer.layer.5.ffn.lin2.lora.A
FROZEN: distilbert.transformer.layer.5.ffn.lin2.lora.B
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
[782/782 1:20:33, Epoch 1/1]
Step Training Loss Validation Loss Accuracy F1
50 0.117500 0.208835 0.930400 0.930336
100 0.150800 0.206763 0.928000 0.927940
150 0.162700 0.207248 0.924000 0.924006
200 0.109600 0.207848 0.930400 0.930310
250 0.136800 0.213334 0.927200 0.927227
300 0.141400 0.206167 0.928800 0.928735
350 0.143200 0.206361 0.927200 0.927194
400 0.114100 0.212035 0.925600 0.925656
450 0.127800 0.209920 0.928800 0.928772
500 0.142900 0.205849 0.930400 0.930323
550 0.126100 0.205841 0.932000 0.931912
600 0.128800 0.207515 0.928000 0.927965
650 0.100400 0.207170 0.930400 0.930310
700 0.102100 0.209562 0.930400 0.930336
750 0.107500 0.210749 0.930400 0.930336

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 558963 KiB |  12777 MiB | 392916 GiB | 392915 GiB |
|       from large pool | 541440 KiB |  12712 MiB | 391456 GiB | 391455 GiB |
|       from small pool |  17523 KiB |     67 MiB |   1459 GiB |   1459 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 558963 KiB |  12777 MiB | 392916 GiB | 392915 GiB |
|       from large pool | 541440 KiB |  12712 MiB | 391456 GiB | 391455 GiB |
|       from small pool |  17523 KiB |     67 MiB |   1459 GiB |   1459 GiB |
|---------------------------------------------------------------------------|
| Requested memory      | 556750 KiB |  12774 MiB | 392587 GiB | 392586 GiB |
|       from large pool | 539228 KiB |  12708 MiB | 391134 GiB | 391134 GiB |
|       from small pool |  17522 KiB |     67 MiB |   1452 GiB |   1452 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  12940 MiB |  13058 MiB |  37632 MiB |  24692 MiB |
|       from large pool |  12872 MiB |  12988 MiB |  37440 MiB |  24568 MiB |
|       from small pool |     68 MiB |     70 MiB |    192 MiB |    124 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 186509 KiB | 403075 KiB |  69510 GiB |  69510 GiB |
|       from large pool | 179456 KiB | 396800 KiB |  67937 GiB |  67937 GiB |
|       from small pool |   7053 KiB |  43066 KiB |   1572 GiB |   1572 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     856    |    1343    |   36234 K  |   36233 K  |
|       from large pool |      80    |     334    |   10124 K  |   10124 K  |
|       from small pool |     776    |    1235    |   26109 K  |   26109 K  |
|---------------------------------------------------------------------------|
| Active allocs         |     856    |    1343    |   36234 K  |   36233 K  |
|       from large pool |      80    |     334    |   10124 K  |   10124 K  |
|       from small pool |     776    |    1235    |   26109 K  |   26109 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     259    |     266    |     736    |     477    |
|       from large pool |     225    |     231    |     640    |     415    |
|       from small pool |      34    |      35    |      96    |      62    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      37    |      91    |   15948 K  |   15948 K  |
|       from large pool |      19    |      25    |    2009 K  |    2009 K  |
|       from small pool |      18    |      73    |   13938 K  |   13938 K  |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

distilbert.transformer.layer.0.attention.q_lin: |A|=0.2869, |B|=0.1876, |∇A|=1.333e-05, |∇B|=1.196e-05, |LoRA(x)|=3.76e+04, B≠0=12288
distilbert.transformer.layer.0.attention.k_lin: |A|=0.2906, |B|=0.1935, |∇A|=6.433e-06, |∇B|=1.195e-05, |LoRA(x)|=4.247e+04, B≠0=12288
distilbert.transformer.layer.0.attention.v_lin: |A|=0.2801, |B|=0.1574, |∇A|=8.705e-06, |∇B|=1.581e-05, |LoRA(x)|=4.329e+04, B≠0=12288
distilbert.transformer.layer.0.attention.out_lin: |A|=0.268, |B|=0.1646, |∇A|=1.558e-05, |∇B|=3.117e-05, |LoRA(x)|=2.49e+04, B≠0=12288
distilbert.transformer.layer.0.ffn.lin1: |A|=0.304, |B|=0.2005, |∇A|=2.365e-05, |∇B|=1.405e-05, |LoRA(x)|=9.32e+04, B≠0=49152
distilbert.transformer.layer.0.ffn.lin2: |A|=0.2672, |B|=0.1635, |∇A|=8.24e-06, |∇B|=3.471e-05, |LoRA(x)|=4.23e+04, B≠0=12288
distilbert.transformer.layer.1.attention.q_lin: |A|=0.2818, |B|=0.1742, |∇A|=1.286e-05, |∇B|=1.379e-05, |LoRA(x)|=3.027e+04, B≠0=12288
distilbert.transformer.layer.1.attention.k_lin: |A|=0.271, |B|=0.1789, |∇A|=7.098e-06, |∇B|=1.336e-05, |LoRA(x)|=3.711e+04, B≠0=12288
distilbert.transformer.layer.1.attention.v_lin: |A|=0.2626, |B|=0.1468, |∇A|=1.056e-05, |∇B|=2.244e-05, |LoRA(x)|=4.453e+04, B≠0=12288
distilbert.transformer.layer.1.attention.out_lin: |A|=0.267, |B|=0.1492, |∇A|=1.1e-05, |∇B|=2.913e-05, |LoRA(x)|=3.652e+04, B≠0=12288
distilbert.transformer.layer.1.ffn.lin1: |A|=0.2876, |B|=0.1862, |∇A|=1.606e-05, |∇B|=1.513e-05, |LoRA(x)|=1.066e+05, B≠0=49152
distilbert.transformer.layer.1.ffn.lin2: |A|=0.2764, |B|=0.1592, |∇A|=1.147e-05, |∇B|=4.788e-05, |LoRA(x)|=2.209e+04, B≠0=12288
distilbert.transformer.layer.2.attention.q_lin: |A|=0.2737, |B|=0.1739, |∇A|=1.058e-05, |∇B|=1.74e-05, |LoRA(x)|=3.618e+04, B≠0=12288
distilbert.transformer.layer.2.attention.k_lin: |A|=0.2877, |B|=0.1872, |∇A|=1.217e-05, |∇B|=1.751e-05, |LoRA(x)|=4.144e+04, B≠0=12288
distilbert.transformer.layer.2.attention.v_lin: |A|=0.2554, |B|=0.1239, |∇A|=6.755e-06, |∇B|=1.772e-05, |LoRA(x)|=5.225e+04, B≠0=12288
distilbert.transformer.layer.2.attention.out_lin: |A|=0.2617, |B|=0.1543, |∇A|=2.443e-05, |∇B|=3.976e-05, |LoRA(x)|=1.704e+04, B≠0=12288
distilbert.transformer.layer.2.ffn.lin1: |A|=0.3031, |B|=0.2014, |∇A|=1.933e-05, |∇B|=1.793e-05, |LoRA(x)|=1.211e+05, B≠0=49152
distilbert.transformer.layer.2.ffn.lin2: |A|=0.2651, |B|=0.1528, |∇A|=1.109e-05, |∇B|=4.721e-05, |LoRA(x)|=3.653e+04, B≠0=12288
distilbert.transformer.layer.3.attention.q_lin: |A|=0.2756, |B|=0.1892, |∇A|=1.127e-05, |∇B|=1.954e-05, |LoRA(x)|=3.524e+04, B≠0=12288
distilbert.transformer.layer.3.attention.k_lin: |A|=0.2917, |B|=0.1971, |∇A|=2.061e-05, |∇B|=2.185e-05, |LoRA(x)|=3.752e+04, B≠0=12288
distilbert.transformer.layer.3.attention.v_lin: |A|=0.2585, |B|=0.1368, |∇A|=1.276e-05, |∇B|=1.827e-05, |LoRA(x)|=3.405e+04, B≠0=12288
distilbert.transformer.layer.3.attention.out_lin: |A|=0.2618, |B|=0.1422, |∇A|=3.202e-05, |∇B|=2.984e-05, |LoRA(x)|=1.839e+04, B≠0=12288
distilbert.transformer.layer.3.ffn.lin1: |A|=0.2599, |B|=0.1674, |∇A|=2.259e-05, |∇B|=1.615e-05, |LoRA(x)|=7.004e+04, B≠0=49152
distilbert.transformer.layer.3.ffn.lin2: |A|=0.2549, |B|=0.137, |∇A|=9.369e-06, |∇B|=4.113e-05, |LoRA(x)|=3.733e+04, B≠0=12288
distilbert.transformer.layer.4.attention.q_lin: |A|=0.2624, |B|=0.1878, |∇A|=9.001e-06, |∇B|=1.755e-05, |LoRA(x)|=4.085e+04, B≠0=12288
distilbert.transformer.layer.4.attention.k_lin: |A|=0.2735, |B|=0.1897, |∇A|=2.599e-05, |∇B|=1.944e-05, |LoRA(x)|=3.824e+04, B≠0=12288
distilbert.transformer.layer.4.attention.v_lin: |A|=0.2421, |B|=0.1227, |∇A|=8.159e-06, |∇B|=1.143e-05, |LoRA(x)|=3.423e+04, B≠0=12288
distilbert.transformer.layer.4.attention.out_lin: |A|=0.2631, |B|=0.1349, |∇A|=3.188e-05, |∇B|=2.495e-05, |LoRA(x)|=2.062e+04, B≠0=12288
distilbert.transformer.layer.4.ffn.lin1: |A|=0.2609, |B|=0.1508, |∇A|=1.334e-05, |∇B|=7.769e-06, |LoRA(x)|=8.132e+04, B≠0=49152
distilbert.transformer.layer.4.ffn.lin2: |A|=0.2387, |B|=0.1151, |∇A|=1.079e-06, |∇B|=1.035e-05, |LoRA(x)|=1.112e+05, B≠0=12288
distilbert.transformer.layer.5.attention.q_lin: |A|=0.2637, |B|=0.1848, |∇A|=5.53e-06, |∇B|=1.5e-05, |LoRA(x)|=6.21e+04, B≠0=12288
distilbert.transformer.layer.5.attention.k_lin: |A|=0.2632, |B|=0.1592, |∇A|=1.749e-05, |∇B|=8.815e-06, |LoRA(x)|=4.364e+04, B≠0=12288
distilbert.transformer.layer.5.attention.v_lin: |A|=0.242, |B|=0.1006, |∇A|=5.803e-06, |∇B|=6.981e-06, |LoRA(x)|=3.932e+04, B≠0=12288
distilbert.transformer.layer.5.attention.out_lin: |A|=0.2455, |B|=0.1123, |∇A|=1.09e-05, |∇B|=1.266e-05, |LoRA(x)|=2.144e+04, B≠0=12288
distilbert.transformer.layer.5.ffn.lin1: |A|=0.2427, |B|=0.131, |∇A|=5.69e-06, |∇B|=2.801e-06, |LoRA(x)|=6.67e+04, B≠0=49152
distilbert.transformer.layer.5.ffn.lin2: |A|=0.2349, |B|=0.1129, |∇A|=0, |∇B|=0, |LoRA(x)|=2.068e+05, B≠0=12288
In [12]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.28707337379455566
distilbert.transformer.layer.0.attention.q_lin.lora.B 0.18769995868206024
distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2907640337944031
distilbert.transformer.layer.0.attention.k_lin.lora.B 0.1936057060956955
distilbert.transformer.layer.0.attention.v_lin.lora.A 0.28023409843444824
distilbert.transformer.layer.0.attention.v_lin.lora.B 0.1574789583683014
distilbert.transformer.layer.0.attention.out_lin.lora.A 0.26817581057548523
distilbert.transformer.layer.0.attention.out_lin.lora.B 0.16465796530246735
distilbert.transformer.layer.0.ffn.lin1.lora.A 0.30428409576416016
distilbert.transformer.layer.0.ffn.lin1.lora.B 0.20058000087738037
distilbert.transformer.layer.0.ffn.lin2.lora.A 0.2674410343170166
distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16359667479991913
distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2819472551345825
distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17439055442810059
distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2710683047771454
distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17898575961589813
distilbert.transformer.layer.1.attention.v_lin.lora.A 0.2627217173576355
distilbert.transformer.layer.1.attention.v_lin.lora.B 0.1469723880290985
distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2670312821865082
distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14931659400463104
distilbert.transformer.layer.1.ffn.lin1.lora.A 0.2877423167228699
distilbert.transformer.layer.1.ffn.lin1.lora.B 0.18635958433151245
distilbert.transformer.layer.1.ffn.lin2.lora.A 0.27652254700660706
distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15932457149028778
distilbert.transformer.layer.2.attention.q_lin.lora.A 0.273947536945343
distilbert.transformer.layer.2.attention.q_lin.lora.B 0.1739175021648407
distilbert.transformer.layer.2.attention.k_lin.lora.A 0.28790760040283203
distilbert.transformer.layer.2.attention.k_lin.lora.B 0.18734395503997803
distilbert.transformer.layer.2.attention.v_lin.lora.A 0.25540855526924133
distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12403412163257599
distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26176193356513977
distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15451809763908386
distilbert.transformer.layer.2.ffn.lin1.lora.A 0.3031601309776306
distilbert.transformer.layer.2.ffn.lin1.lora.B 0.20152036845684052
distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2652890384197235
distilbert.transformer.layer.2.ffn.lin2.lora.B 0.15302824974060059
distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2757566273212433
distilbert.transformer.layer.3.attention.q_lin.lora.B 0.18924658000469208
distilbert.transformer.layer.3.attention.k_lin.lora.A 0.2918875217437744
distilbert.transformer.layer.3.attention.k_lin.lora.B 0.19718694686889648
distilbert.transformer.layer.3.attention.v_lin.lora.A 0.25860539078712463
distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13704372942447662
distilbert.transformer.layer.3.attention.out_lin.lora.A 0.26196131110191345
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.14226996898651123
distilbert.transformer.layer.3.ffn.lin1.lora.A 0.25997695326805115
distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16748486459255219
distilbert.transformer.layer.3.ffn.lin2.lora.A 0.25504302978515625
distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13720043003559113
distilbert.transformer.layer.4.attention.q_lin.lora.A 0.26243939995765686
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.1878441572189331
distilbert.transformer.layer.4.attention.k_lin.lora.A 0.273539662361145
distilbert.transformer.layer.4.attention.k_lin.lora.B 0.189828559756279
distilbert.transformer.layer.4.attention.v_lin.lora.A 0.2422509640455246
distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12290792167186737
distilbert.transformer.layer.4.attention.out_lin.lora.A 0.26336467266082764
distilbert.transformer.layer.4.attention.out_lin.lora.B 0.13514696061611176
distilbert.transformer.layer.4.ffn.lin1.lora.A 0.26100867986679077
distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15097534656524658
distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23862293362617493
distilbert.transformer.layer.4.ffn.lin2.lora.B 0.11521776020526886
distilbert.transformer.layer.5.attention.q_lin.lora.A 0.2639756500720978
distilbert.transformer.layer.5.attention.q_lin.lora.B 0.18488191068172455
distilbert.transformer.layer.5.attention.k_lin.lora.A 0.2634556293487549
distilbert.transformer.layer.5.attention.k_lin.lora.B 0.15944841504096985
distilbert.transformer.layer.5.attention.v_lin.lora.A 0.2421378344297409
distilbert.transformer.layer.5.attention.v_lin.lora.B 0.10063609480857849
distilbert.transformer.layer.5.attention.out_lin.lora.A 0.2455993890762329
distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11262671649456024
distilbert.transformer.layer.5.ffn.lin1.lora.A 0.24293923377990723
distilbert.transformer.layer.5.ffn.lin1.lora.B 0.13117605447769165
distilbert.transformer.layer.5.ffn.lin2.lora.A 0.23493000864982605
distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11285355687141418
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 40.5814
distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 26.4466
distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 41.0152
distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 27.1966
distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 39.8526
distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 22.4586
distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.2699
distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 23.3672
distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.3197
distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 56.7974
distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 76.9530
distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 23.3197
distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 40.1291
distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 24.7248
distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 38.3195
distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 25.1494
distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 37.3453
distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 20.9148
distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.1472
distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 21.3194
distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 40.6311
distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 52.8018
distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 79.4618
distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 22.7955
distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 38.7868
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 24.5725
distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 40.7129
distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 26.4076
distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 36.4647
distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 17.7767
distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 37.4144
distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 21.9998
distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 42.9848
distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 57.3165
distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 76.4062
distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 22.0261
distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 39.1650
distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 26.6011
distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 41.4144
distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 27.7537
distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 36.8473
distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 19.7296
distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 37.3618
distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 20.2695
distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 36.8514
distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 47.5482
distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 73.5448
distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 19.6095
distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 37.2356
distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 26.5284
distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 38.7939
distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 26.8519
distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.4178
distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 17.5407
distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 37.8966
distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 19.4117
distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 36.8628
distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 42.9398
distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 67.2699
distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 16.7667
distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 37.5402
distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 26.1881
distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 37.4260
distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 22.9560
distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.5026
distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 14.7060
distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.6724
distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 16.2978
distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 34.6172
distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 38.2762
distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 65.7199
distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 16.5415
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.scale_out 2.076575756072998
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9624719619750977
distilbert.transformer.layer.0.attention.k_lin.scale_out 2.059420585632324
distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9576948881149292
distilbert.transformer.layer.0.attention.v_lin.scale_out 1.935288429260254
distilbert.transformer.layer.0.attention.v_lin.scale_in 1.9621213674545288
distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9077293872833252
distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9148902893066406
distilbert.transformer.layer.1.attention.q_lin.scale_out 2.003368377685547
distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9634246826171875
distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9907522201538086
distilbert.transformer.layer.1.attention.k_lin.scale_in 1.9740617275238037
distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9336799383163452
distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9234355688095093
distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9045276641845703
distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9323878288269043
distilbert.transformer.layer.2.attention.q_lin.scale_out 2.010300636291504
distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9660112857818604
distilbert.transformer.layer.2.attention.k_lin.scale_out 2.053152322769165
distilbert.transformer.layer.2.attention.k_lin.scale_in 1.986611247062683
distilbert.transformer.layer.2.attention.v_lin.scale_out 1.9197368621826172
distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9203671216964722
distilbert.transformer.layer.2.attention.out_lin.scale_out 1.9001054763793945
distilbert.transformer.layer.2.attention.out_lin.scale_in 1.9074167013168335
distilbert.transformer.layer.3.attention.q_lin.scale_out 2.021730422973633
distilbert.transformer.layer.3.attention.q_lin.scale_in 1.949169397354126
distilbert.transformer.layer.3.attention.k_lin.scale_out 2.077785015106201
distilbert.transformer.layer.3.attention.k_lin.scale_in 1.9428112506866455
distilbert.transformer.layer.3.attention.v_lin.scale_out 1.933730125427246
distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9019935131072998
distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9261023998260498
distilbert.transformer.layer.3.attention.out_lin.scale_in 1.9190481901168823
distilbert.transformer.layer.4.attention.q_lin.scale_out 2.045186758041382
distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9197725057601929
distilbert.transformer.layer.4.attention.k_lin.scale_out 2.065519094467163
distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9318139553070068
distilbert.transformer.layer.4.attention.v_lin.scale_out 1.8299648761749268
distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9480531215667725
distilbert.transformer.layer.4.attention.out_lin.scale_out 1.917306900024414
distilbert.transformer.layer.4.attention.out_lin.scale_in 1.84946870803833
distilbert.transformer.layer.5.attention.q_lin.scale_out 1.948282241821289
distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9058736562728882
distilbert.transformer.layer.5.attention.k_lin.scale_out 2.071871519088745
distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9120490550994873
distilbert.transformer.layer.5.attention.v_lin.scale_out 1.896397352218628
distilbert.transformer.layer.5.attention.v_lin.scale_in 1.9039411544799805
distilbert.transformer.layer.5.attention.out_lin.scale_out 1.8885447978973389
distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9554933309555054
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.4880
distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 54.9874
distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 58.0535
distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8672
distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.3577
distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 54.9888
distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.5378
distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 53.7379
distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.3696
distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.0367
distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.9233
distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.1861
distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 54.1079
distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8645
distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.2956
distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.1854
distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.3724
distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 54.9977
distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.6940
distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.6386
distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.6841
distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 53.8057
distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.3392
distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.4810
distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.7827
distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.5376
distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.3907
distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.5073
distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 54.1504
distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.2825
distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 53.9838
distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.7123
distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.5195
distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7273
distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 58.1384
distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.0466
distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.6293
distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.3511
distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 53.9963
distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 52.0921
distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.0667
distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.4637
distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 58.1353
distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.5325
distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.3339
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.3115
distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.2332
distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5357
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.m_out 0.39016568660736084
distilbert.transformer.layer.0.attention.q_lin.m_in 0.2815612256526947
distilbert.transformer.layer.0.attention.k_lin.m_out 0.3865947425365448
distilbert.transformer.layer.0.attention.k_lin.m_in 0.28855225443840027
distilbert.transformer.layer.0.attention.v_lin.m_out 0.28526735305786133
distilbert.transformer.layer.0.attention.v_lin.m_in 0.28309836983680725
distilbert.transformer.layer.0.attention.out_lin.m_out 0.26175251603126526
distilbert.transformer.layer.0.attention.out_lin.m_in 0.2526334524154663
distilbert.transformer.layer.1.attention.q_lin.m_out 0.33136361837387085
distilbert.transformer.layer.1.attention.q_lin.m_in 0.2759714126586914
distilbert.transformer.layer.1.attention.k_lin.m_out 0.32868993282318115
distilbert.transformer.layer.1.attention.k_lin.m_in 0.26953765749931335
distilbert.transformer.layer.1.attention.v_lin.m_out 0.26419997215270996
distilbert.transformer.layer.1.attention.v_lin.m_in 0.2546566128730774
distilbert.transformer.layer.1.attention.out_lin.m_out 0.2542833983898163
distilbert.transformer.layer.1.attention.out_lin.m_in 0.25218233466148376
distilbert.transformer.layer.2.attention.q_lin.m_out 0.3223288655281067
distilbert.transformer.layer.2.attention.q_lin.m_in 0.2641069293022156
distilbert.transformer.layer.2.attention.k_lin.m_out 0.363922119140625
distilbert.transformer.layer.2.attention.k_lin.m_in 0.2987942099571228
distilbert.transformer.layer.2.attention.v_lin.m_out 0.24425539374351501
distilbert.transformer.layer.2.attention.v_lin.m_in 0.24729464948177338
distilbert.transformer.layer.2.attention.out_lin.m_out 0.2655073404312134
distilbert.transformer.layer.2.attention.out_lin.m_in 0.23778128623962402
distilbert.transformer.layer.3.attention.q_lin.m_out 0.343430757522583
distilbert.transformer.layer.3.attention.q_lin.m_in 0.2754462957382202
distilbert.transformer.layer.3.attention.k_lin.m_out 0.3848956525325775
distilbert.transformer.layer.3.attention.k_lin.m_in 0.28433045744895935
distilbert.transformer.layer.3.attention.v_lin.m_out 0.2534070909023285
distilbert.transformer.layer.3.attention.v_lin.m_in 0.23691999912261963
distilbert.transformer.layer.3.attention.out_lin.m_out 0.26040372252464294
distilbert.transformer.layer.3.attention.out_lin.m_in 0.2518913149833679
distilbert.transformer.layer.4.attention.q_lin.m_out 0.3547815680503845
distilbert.transformer.layer.4.attention.q_lin.m_in 0.24450191855430603
distilbert.transformer.layer.4.attention.k_lin.m_out 0.388192355632782
distilbert.transformer.layer.4.attention.k_lin.m_in 0.2566456198692322
distilbert.transformer.layer.4.attention.v_lin.m_out 0.22937855124473572
distilbert.transformer.layer.4.attention.v_lin.m_in 0.24749258160591125
distilbert.transformer.layer.4.attention.out_lin.m_out 0.2755943536758423
distilbert.transformer.layer.4.attention.out_lin.m_in 0.2263449877500534
distilbert.transformer.layer.5.attention.q_lin.m_out 0.32583218812942505
distilbert.transformer.layer.5.attention.q_lin.m_in 0.2522851228713989
distilbert.transformer.layer.5.attention.k_lin.m_out 0.378021240234375
distilbert.transformer.layer.5.attention.k_lin.m_in 0.2485998570919037
distilbert.transformer.layer.5.attention.v_lin.m_out 0.2562413215637207
distilbert.transformer.layer.5.attention.v_lin.m_in 0.22845476865768433
distilbert.transformer.layer.5.attention.out_lin.m_out 0.2536100745201111
distilbert.transformer.layer.5.attention.out_lin.m_in 0.2289573848247528
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 13.4592
distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 10.3601
distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 13.4781
distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.3834
distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 10.2648
distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.2542
distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.6388
distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.7940
distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 11.8276
distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.2812
distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 11.5565
distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.6223
distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 9.4099
distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.6938
distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 8.9958
distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.8709
distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.3210
distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.6061
distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 12.5516
distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 10.6418
distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 8.9932
distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.6817
distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.4676
distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.2764
distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.8944
distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.8136
distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 13.1322
distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.4065
distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 9.1533
distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.2861
distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.2659
distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.7452
distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.6204
distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.1060
distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.4002
distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.4113
distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.4958
distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.1393
distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.0271
distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.3287
distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.9104
distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.5670
distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.7845
distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.2074
distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.3198
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.0539
distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 9.3842
distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.4135

Un-Freeze LoRA parameters for layer 5 FFN¶

In [13]:
# Un-freeze LoRA parameters for layer.5.ffn.lin2
for name, param in trainer_ddora_all_attn.model.named_parameters():
    if "transformer.layer.5.ffn.lin2" in name and "lora" in name:
        param.requires_grad = True
        print(f"UNFROZEN: {name}")


dropout = 0.1 #################
lora_rank = 16
lora_alpha = 128
weight_decay = 1e-5
scaling_factor=2.0
batch_size = 32
learning_rate = 1e-4 ###############


from transformers import TrainingArguments
eval_steps = 50
logging_steps = 50
output_dir_prefix = "finetuned-imdb-"

training_args_ddora_all_attn = TrainingArguments(
    output_dir=f"{output_dir_prefix}lora-all-attn",
    num_train_epochs=1, 
    #max_steps=200,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    save_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
    push_to_hub=False,
    max_grad_norm=1.0,
    report_to="none",
    log_level="error"
)

trainer_ddora_all_attn = Trainer(
    model=model_ddora_all_attn,
    args=training_args_ddora_all_attn,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_metrics,
)

trainer_ddora_all_attn.optimizer = create_custom_optimizer(
    trainer_ddora_all_attn.model,
    base_lr=5e-4, ###########
    lr_B_scale=0.5, #############
    lr_scale_params=1.0, #########
    weight_decay=1e-5,
)


hooks1, monitor1 = monitor_lora_parameters(trainer_ddora_all_attn.model)
#hooks2, gradient_history2 = monitor_gradients(trainer_ddora_all_attn.model)

#Train!
trainer_ddora_all_attn.train()
print (torch.cuda.memory_summary())

#for hook in hooks2:
#    hook.remove()

#for name, grads in gradient_history2.items():
#    print(f"{name}: Mean grad norm = {np.mean(grads):.6f}, Max = {np.max(grads):.6f}")

for hook in hooks1:
    hook.remove()
# Aggregate/log after training
from collections import defaultdict
agg = defaultdict(list)
for key, vals in monitor1.items():
    grouped = defaultdict(list)
    for name, val in vals:
        grouped[name].append(val)
    agg[key] = {name: sum(vs)/len(vs) for name, vs in grouped.items()}

for name in agg["A_abs_mean"]:
    print(f"{name}: |A|={agg['A_abs_mean'][name]:.4g}, |B|={agg['B_abs_mean'][name]:.4g}, "
          f"|∇A|={agg['A_grad_mean'].get(name, 0.0):.4g}, |∇B|={agg['B_grad_mean'].get(name, 0.0):.4g}, "
          f"|LoRA(x)|={agg['lora_output_norm'].get(name, 0.0):.4g}, B≠0={agg['B_nonzero_count'].get(name, 0):.0f}")

#eval_results_ddora_all_attn = trainer_ddora_all_attn.evaluate(dataset_encoded["test"])
#print(f"DDoRA (All Attention) Test Results: {eval_results_ddora_all_attn}")
UNFROZEN: distilbert.transformer.layer.5.ffn.lin2.lora.A
UNFROZEN: distilbert.transformer.layer.5.ffn.lin2.lora.B
C:\Users\alexa\miniconda3\envs\grpo_env\lib\site-packages\transformers\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
[782/782 1:41:38, Epoch 1/1]
Step Training Loss Validation Loss Accuracy F1
50 0.090400 0.213297 0.932000 0.931973
100 0.115900 0.215432 0.928800 0.928783
150 0.126300 0.215974 0.928800 0.928772
200 0.082900 0.218342 0.928800 0.928735
250 0.111600 0.221422 0.928000 0.928021
300 0.119800 0.219625 0.928000 0.927965
350 0.123500 0.220197 0.925600 0.925617
400 0.092500 0.223016 0.926400 0.926442
450 0.112200 0.219613 0.929600 0.929554
500 0.125800 0.216974 0.929600 0.929529
550 0.114600 0.216819 0.928800 0.928722
600 0.125400 0.215816 0.928800 0.928772
650 0.096700 0.215295 0.932000 0.931950
700 0.098400 0.216751 0.929600 0.929566
750 0.105500 0.217510 0.928800 0.928760

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 559443 KiB |  12777 MiB | 458150 GiB | 458149 GiB |
|       from large pool | 541440 KiB |  12712 MiB | 456448 GiB | 456447 GiB |
|       from small pool |  18003 KiB |     67 MiB |   1702 GiB |   1702 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 559443 KiB |  12777 MiB | 458150 GiB | 458149 GiB |
|       from large pool | 541440 KiB |  12712 MiB | 456448 GiB | 456447 GiB |
|       from small pool |  18003 KiB |     67 MiB |   1702 GiB |   1702 GiB |
|---------------------------------------------------------------------------|
| Requested memory      | 557230 KiB |  12774 MiB | 457760 GiB | 457760 GiB |
|       from large pool | 539228 KiB |  12708 MiB | 456067 GiB | 456066 GiB |
|       from small pool |  18002 KiB |     67 MiB |   1693 GiB |   1693 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  13058 MiB |  13058 MiB |  49978 MiB |  36920 MiB |
|       from large pool |  12988 MiB |  12988 MiB |  49724 MiB |  36736 MiB |
|       from small pool |     70 MiB |     70 MiB |    254 MiB |    184 MiB |
|---------------------------------------------------------------------------|
| Non-releasable memory | 186029 KiB | 403075 KiB |  80605 GiB |  80605 GiB |
|       from large pool | 179456 KiB | 396800 KiB |  78771 GiB |  78771 GiB |
|       from small pool |   6573 KiB |  43066 KiB |   1833 GiB |   1833 GiB |
|---------------------------------------------------------------------------|
| Allocations           |     860    |    1343    |   42260 K  |   42259 K  |
|       from large pool |      80    |     334    |   11805 K  |   11805 K  |
|       from small pool |     780    |    1235    |   30454 K  |   30454 K  |
|---------------------------------------------------------------------------|
| Active allocs         |     860    |    1343    |   42260 K  |   42259 K  |
|       from large pool |      80    |     334    |   11805 K  |   11805 K  |
|       from small pool |     780    |    1235    |   30454 K  |   30454 K  |
|---------------------------------------------------------------------------|
| GPU reserved segments |     263    |     266    |     973    |     710    |
|       from large pool |     228    |     231    |     846    |     618    |
|       from small pool |      35    |      35    |     127    |      92    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      39    |      91    |   18617 K  |   18617 K  |
|       from large pool |      19    |      25    |    2343 K  |    2343 K  |
|       from small pool |      20    |      73    |   16273 K  |   16273 K  |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

distilbert.transformer.layer.0.attention.q_lin: |A|=0.2876, |B|=0.1878, |∇A|=1.438e-05, |∇B|=1.287e-05, |LoRA(x)|=3.684e+04, B≠0=12288
distilbert.transformer.layer.0.attention.k_lin: |A|=0.2912, |B|=0.1936, |∇A|=6.889e-06, |∇B|=1.268e-05, |LoRA(x)|=4.189e+04, B≠0=12288
distilbert.transformer.layer.0.attention.v_lin: |A|=0.2807, |B|=0.1575, |∇A|=9.492e-06, |∇B|=1.722e-05, |LoRA(x)|=4.119e+04, B≠0=12288
distilbert.transformer.layer.0.attention.out_lin: |A|=0.2686, |B|=0.1646, |∇A|=1.672e-05, |∇B|=3.348e-05, |LoRA(x)|=2.419e+04, B≠0=12288
distilbert.transformer.layer.0.ffn.lin1: |A|=0.3048, |B|=0.2006, |∇A|=2.493e-05, |∇B|=1.487e-05, |LoRA(x)|=9.104e+04, B≠0=49152
distilbert.transformer.layer.0.ffn.lin2: |A|=0.2678, |B|=0.1636, |∇A|=9.089e-06, |∇B|=3.795e-05, |LoRA(x)|=4.017e+04, B≠0=12288
distilbert.transformer.layer.1.attention.q_lin: |A|=0.2823, |B|=0.1744, |∇A|=1.387e-05, |∇B|=1.468e-05, |LoRA(x)|=2.939e+04, B≠0=12288
distilbert.transformer.layer.1.attention.k_lin: |A|=0.2715, |B|=0.179, |∇A|=7.818e-06, |∇B|=1.444e-05, |LoRA(x)|=3.56e+04, B≠0=12288
distilbert.transformer.layer.1.attention.v_lin: |A|=0.2631, |B|=0.147, |∇A|=1.15e-05, |∇B|=2.431e-05, |LoRA(x)|=4.313e+04, B≠0=12288
distilbert.transformer.layer.1.attention.out_lin: |A|=0.2673, |B|=0.1493, |∇A|=1.245e-05, |∇B|=3.251e-05, |LoRA(x)|=3.406e+04, B≠0=12288
distilbert.transformer.layer.1.ffn.lin1: |A|=0.2883, |B|=0.1864, |∇A|=1.843e-05, |∇B|=1.683e-05, |LoRA(x)|=9.673e+04, B≠0=49152
distilbert.transformer.layer.1.ffn.lin2: |A|=0.2769, |B|=0.1594, |∇A|=1.305e-05, |∇B|=5.319e-05, |LoRA(x)|=2.091e+04, B≠0=12288
distilbert.transformer.layer.2.attention.q_lin: |A|=0.2743, |B|=0.1739, |∇A|=1.158e-05, |∇B|=1.88e-05, |LoRA(x)|=3.51e+04, B≠0=12288
distilbert.transformer.layer.2.attention.k_lin: |A|=0.2883, |B|=0.1874, |∇A|=1.342e-05, |∇B|=1.908e-05, |LoRA(x)|=4.053e+04, B≠0=12288
distilbert.transformer.layer.2.attention.v_lin: |A|=0.2556, |B|=0.124, |∇A|=7.334e-06, |∇B|=1.962e-05, |LoRA(x)|=5.071e+04, B≠0=12288
distilbert.transformer.layer.2.attention.out_lin: |A|=0.262, |B|=0.1546, |∇A|=2.735e-05, |∇B|=4.336e-05, |LoRA(x)|=1.624e+04, B≠0=12288
distilbert.transformer.layer.2.ffn.lin1: |A|=0.3036, |B|=0.2016, |∇A|=2.057e-05, |∇B|=1.898e-05, |LoRA(x)|=1.181e+05, B≠0=49152
distilbert.transformer.layer.2.ffn.lin2: |A|=0.2657, |B|=0.1531, |∇A|=1.202e-05, |∇B|=5.005e-05, |LoRA(x)|=3.564e+04, B≠0=12288
distilbert.transformer.layer.3.attention.q_lin: |A|=0.2763, |B|=0.1892, |∇A|=1.221e-05, |∇B|=2.084e-05, |LoRA(x)|=3.428e+04, B≠0=12288
distilbert.transformer.layer.3.attention.k_lin: |A|=0.2921, |B|=0.1972, |∇A|=2.221e-05, |∇B|=2.332e-05, |LoRA(x)|=3.677e+04, B≠0=12288
distilbert.transformer.layer.3.attention.v_lin: |A|=0.259, |B|=0.1371, |∇A|=1.384e-05, |∇B|=1.967e-05, |LoRA(x)|=3.364e+04, B≠0=12288
distilbert.transformer.layer.3.attention.out_lin: |A|=0.2623, |B|=0.1423, |∇A|=3.489e-05, |∇B|=3.26e-05, |LoRA(x)|=1.808e+04, B≠0=12288
distilbert.transformer.layer.3.ffn.lin1: |A|=0.2605, |B|=0.1675, |∇A|=2.439e-05, |∇B|=1.716e-05, |LoRA(x)|=6.808e+04, B≠0=49152
distilbert.transformer.layer.3.ffn.lin2: |A|=0.2554, |B|=0.1372, |∇A|=1.111e-05, |∇B|=4.632e-05, |LoRA(x)|=3.403e+04, B≠0=12288
distilbert.transformer.layer.4.attention.q_lin: |A|=0.2629, |B|=0.1879, |∇A|=9.482e-06, |∇B|=1.871e-05, |LoRA(x)|=3.945e+04, B≠0=12288
distilbert.transformer.layer.4.attention.k_lin: |A|=0.2739, |B|=0.1899, |∇A|=2.691e-05, |∇B|=2.026e-05, |LoRA(x)|=3.793e+04, B≠0=12288
distilbert.transformer.layer.4.attention.v_lin: |A|=0.2427, |B|=0.1229, |∇A|=9.548e-06, |∇B|=1.285e-05, |LoRA(x)|=3.232e+04, B≠0=12288
distilbert.transformer.layer.4.attention.out_lin: |A|=0.2637, |B|=0.1353, |∇A|=3.307e-05, |∇B|=2.627e-05, |LoRA(x)|=2.043e+04, B≠0=12288
distilbert.transformer.layer.4.ffn.lin1: |A|=0.2615, |B|=0.151, |∇A|=1.416e-05, |∇B|=8.211e-06, |LoRA(x)|=8.067e+04, B≠0=49152
distilbert.transformer.layer.4.ffn.lin2: |A|=0.2385, |B|=0.1152, |∇A|=1.275e-06, |∇B|=1.137e-05, |LoRA(x)|=1.052e+05, B≠0=12288
distilbert.transformer.layer.5.attention.q_lin: |A|=0.2649, |B|=0.1849, |∇A|=5.999e-06, |∇B|=1.623e-05, |LoRA(x)|=6.015e+04, B≠0=12288
distilbert.transformer.layer.5.attention.k_lin: |A|=0.2639, |B|=0.1595, |∇A|=1.905e-05, |∇B|=9.632e-06, |LoRA(x)|=4.3e+04, B≠0=12288
distilbert.transformer.layer.5.attention.v_lin: |A|=0.2426, |B|=0.1004, |∇A|=7.553e-06, |∇B|=8.706e-06, |LoRA(x)|=3.525e+04, B≠0=12288
distilbert.transformer.layer.5.attention.out_lin: |A|=0.2455, |B|=0.1125, |∇A|=1.371e-05, |∇B|=1.414e-05, |LoRA(x)|=1.956e+04, B≠0=12288
distilbert.transformer.layer.5.ffn.lin1: |A|=0.244, |B|=0.1313, |∇A|=6.42e-06, |∇B|=3.114e-06, |LoRA(x)|=6.599e+04, B≠0=49152
distilbert.transformer.layer.5.ffn.lin2: |A|=0.2351, |B|=0.1128, |∇A|=2.28e-07, |∇B|=5.508e-06, |LoRA(x)|=2.037e+05, B≠0=12288

Training Summary¶

Freezing then unfreezing layer 5 FFNs (in the hope other layers could catch up) actually didn't help

In [14]:
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.scale" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
print('Parameter Statistics: mean.abs()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(name, param.abs().mean().item())
print('Parameter Statistics: param.norm()')
for name, param in model_ddora_all_attn.named_parameters():
    if "lin.m" in name:
        print(f"{name} weight norm: {param.norm().item():.4f}")
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.287240594625473
distilbert.transformer.layer.0.attention.q_lin.lora.B 0.18772827088832855
distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2909419536590576
distilbert.transformer.layer.0.attention.k_lin.lora.B 0.19361740350723267
distilbert.transformer.layer.0.attention.v_lin.lora.A 0.280423641204834
distilbert.transformer.layer.0.attention.v_lin.lora.B 0.1574709117412567
distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2683144807815552
distilbert.transformer.layer.0.attention.out_lin.lora.B 0.1646396517753601
distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3044332265853882
distilbert.transformer.layer.0.ffn.lin1.lora.B 0.2005985826253891
distilbert.transformer.layer.0.ffn.lin2.lora.A 0.26756101846694946
distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16362197697162628
distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2821109890937805
distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17438463866710663
distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2712245583534241
distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17899440228939056
distilbert.transformer.layer.1.attention.v_lin.lora.A 0.26285862922668457
distilbert.transformer.layer.1.attention.v_lin.lora.B 0.14696842432022095
distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2671390771865845
distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14929549396038055
distilbert.transformer.layer.1.ffn.lin1.lora.A 0.28791946172714233
distilbert.transformer.layer.1.ffn.lin1.lora.B 0.1863688826560974
distilbert.transformer.layer.1.ffn.lin2.lora.A 0.27668440341949463
distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15934498608112335
distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2741093635559082
distilbert.transformer.layer.2.attention.q_lin.lora.B 0.17391812801361084
distilbert.transformer.layer.2.attention.k_lin.lora.A 0.28805288672447205
distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1873616874217987
distilbert.transformer.layer.2.attention.v_lin.lora.A 0.25546205043792725
distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12401476502418518
distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26189106702804565
distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15452931821346283
distilbert.transformer.layer.2.ffn.lin1.lora.A 0.30333781242370605
distilbert.transformer.layer.2.ffn.lin1.lora.B 0.20153628289699554
distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2654007375240326
distilbert.transformer.layer.2.ffn.lin2.lora.B 0.15304669737815857
distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2759910225868225
distilbert.transformer.layer.3.attention.q_lin.lora.B 0.1892491579055786
distilbert.transformer.layer.3.attention.k_lin.lora.A 0.29188308119773865
distilbert.transformer.layer.3.attention.k_lin.lora.B 0.19718174636363983
distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2588292062282562
distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13704168796539307
distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2620876431465149
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.14226451516151428
distilbert.transformer.layer.3.ffn.lin1.lora.A 0.26023590564727783
distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16748926043510437
distilbert.transformer.layer.3.ffn.lin2.lora.A 0.25515085458755493
distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13720864057540894
distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2625563144683838
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.18786773085594177
distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2736532986164093
distilbert.transformer.layer.4.attention.k_lin.lora.B 0.18987171351909637
distilbert.transformer.layer.4.attention.v_lin.lora.A 0.2424963414669037
distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12292399257421494
distilbert.transformer.layer.4.attention.out_lin.lora.A 0.2634068727493286
distilbert.transformer.layer.4.attention.out_lin.lora.B 0.13521724939346313
distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2611706852912903
distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15097293257713318
distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23863813281059265
distilbert.transformer.layer.4.ffn.lin2.lora.B 0.11519813537597656
distilbert.transformer.layer.5.attention.q_lin.lora.A 0.26431819796562195
distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1849026381969452
distilbert.transformer.layer.5.attention.k_lin.lora.A 0.26360854506492615
distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1594821661710739
distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24227461218833923
distilbert.transformer.layer.5.attention.v_lin.lora.B 0.1004873663187027
distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24558258056640625
distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11250372976064682
distilbert.transformer.layer.5.ffn.lin1.lora.A 0.2434004843235016
distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1312120109796524
distilbert.transformer.layer.5.ffn.lin2.lora.A 0.2350156307220459
distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11283187568187714
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.lora.A weight norm: 40.5990
distilbert.transformer.layer.0.attention.q_lin.lora.B weight norm: 26.4484
distilbert.transformer.layer.0.attention.k_lin.lora.A weight norm: 41.0335
distilbert.transformer.layer.0.attention.k_lin.lora.B weight norm: 27.1989
distilbert.transformer.layer.0.attention.v_lin.lora.A weight norm: 39.8708
distilbert.transformer.layer.0.attention.v_lin.lora.B weight norm: 22.4580
distilbert.transformer.layer.0.attention.out_lin.lora.A weight norm: 38.2828
distilbert.transformer.layer.0.attention.out_lin.lora.B weight norm: 23.3685
distilbert.transformer.layer.0.ffn.lin1.lora.A weight norm: 43.3373
distilbert.transformer.layer.0.ffn.lin1.lora.B weight norm: 56.7995
distilbert.transformer.layer.0.ffn.lin2.lora.A weight norm: 76.9791
distilbert.transformer.layer.0.ffn.lin2.lora.B weight norm: 23.3217
distilbert.transformer.layer.1.attention.q_lin.lora.A weight norm: 40.1453
distilbert.transformer.layer.1.attention.q_lin.lora.B weight norm: 24.7263
distilbert.transformer.layer.1.attention.k_lin.lora.A weight norm: 38.3354
distilbert.transformer.layer.1.attention.k_lin.lora.B weight norm: 25.1513
distilbert.transformer.layer.1.attention.v_lin.lora.A weight norm: 37.3621
distilbert.transformer.layer.1.attention.v_lin.lora.B weight norm: 20.9161
distilbert.transformer.layer.1.attention.out_lin.lora.A weight norm: 38.1554
distilbert.transformer.layer.1.attention.out_lin.lora.B weight norm: 21.3214
distilbert.transformer.layer.1.ffn.lin1.lora.A weight norm: 40.6530
distilbert.transformer.layer.1.ffn.lin1.lora.B weight norm: 52.8055
distilbert.transformer.layer.1.ffn.lin2.lora.A weight norm: 79.4976
distilbert.transformer.layer.1.ffn.lin2.lora.B weight norm: 22.7980
distilbert.transformer.layer.2.attention.q_lin.lora.A weight norm: 38.8030
distilbert.transformer.layer.2.attention.q_lin.lora.B weight norm: 24.5728
distilbert.transformer.layer.2.attention.k_lin.lora.A weight norm: 40.7285
distilbert.transformer.layer.2.attention.k_lin.lora.B weight norm: 26.4093
distilbert.transformer.layer.2.attention.v_lin.lora.A weight norm: 36.4713
distilbert.transformer.layer.2.attention.v_lin.lora.B weight norm: 17.7762
distilbert.transformer.layer.2.attention.out_lin.lora.A weight norm: 37.4290
distilbert.transformer.layer.2.attention.out_lin.lora.B weight norm: 22.0014
distilbert.transformer.layer.2.ffn.lin1.lora.A weight norm: 43.0048
distilbert.transformer.layer.2.ffn.lin1.lora.B weight norm: 57.3172
distilbert.transformer.layer.2.ffn.lin2.lora.A weight norm: 76.4310
distilbert.transformer.layer.2.ffn.lin2.lora.B weight norm: 22.0275
distilbert.transformer.layer.3.attention.q_lin.lora.A weight norm: 39.1868
distilbert.transformer.layer.3.attention.q_lin.lora.B weight norm: 26.6021
distilbert.transformer.layer.3.attention.k_lin.lora.A weight norm: 41.4129
distilbert.transformer.layer.3.attention.k_lin.lora.B weight norm: 27.7546
distilbert.transformer.layer.3.attention.v_lin.lora.A weight norm: 36.8722
distilbert.transformer.layer.3.attention.v_lin.lora.B weight norm: 19.7314
distilbert.transformer.layer.3.attention.out_lin.lora.A weight norm: 37.3789
distilbert.transformer.layer.3.attention.out_lin.lora.B weight norm: 20.2694
distilbert.transformer.layer.3.ffn.lin1.lora.A weight norm: 36.8746
distilbert.transformer.layer.3.ffn.lin1.lora.B weight norm: 47.5508
distilbert.transformer.layer.3.ffn.lin2.lora.A weight norm: 73.5625
distilbert.transformer.layer.3.ffn.lin2.lora.B weight norm: 19.6110
distilbert.transformer.layer.4.attention.q_lin.lora.A weight norm: 37.2517
distilbert.transformer.layer.4.attention.q_lin.lora.B weight norm: 26.5313
distilbert.transformer.layer.4.attention.k_lin.lora.A weight norm: 38.8101
distilbert.transformer.layer.4.attention.k_lin.lora.B weight norm: 26.8539
distilbert.transformer.layer.4.attention.v_lin.lora.A weight norm: 34.4397
distilbert.transformer.layer.4.attention.v_lin.lora.B weight norm: 17.5436
distilbert.transformer.layer.4.attention.out_lin.lora.A weight norm: 37.8973
distilbert.transformer.layer.4.attention.out_lin.lora.B weight norm: 19.4158
distilbert.transformer.layer.4.ffn.lin1.lora.A weight norm: 36.8792
distilbert.transformer.layer.4.ffn.lin1.lora.B weight norm: 42.9426
distilbert.transformer.layer.4.ffn.lin2.lora.A weight norm: 67.2702
distilbert.transformer.layer.4.ffn.lin2.lora.B weight norm: 16.7655
distilbert.transformer.layer.5.attention.q_lin.lora.A weight norm: 37.5755
distilbert.transformer.layer.5.attention.q_lin.lora.B weight norm: 26.1897
distilbert.transformer.layer.5.attention.k_lin.lora.A weight norm: 37.4424
distilbert.transformer.layer.5.attention.k_lin.lora.B weight norm: 22.9606
distilbert.transformer.layer.5.attention.v_lin.lora.A weight norm: 34.5195
distilbert.transformer.layer.5.attention.v_lin.lora.B weight norm: 14.7072
distilbert.transformer.layer.5.attention.out_lin.lora.A weight norm: 34.6669
distilbert.transformer.layer.5.attention.out_lin.lora.B weight norm: 16.2908
distilbert.transformer.layer.5.ffn.lin1.lora.A weight norm: 34.6663
distilbert.transformer.layer.5.ffn.lin1.lora.B weight norm: 38.2804
distilbert.transformer.layer.5.ffn.lin2.lora.A weight norm: 65.7187
distilbert.transformer.layer.5.ffn.lin2.lora.B weight norm: 16.5426
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.scale_out 2.078249931335449
distilbert.transformer.layer.0.attention.q_lin.scale_in 1.9629707336425781
distilbert.transformer.layer.0.attention.k_lin.scale_out 2.0608773231506348
distilbert.transformer.layer.0.attention.k_lin.scale_in 1.9582805633544922
distilbert.transformer.layer.0.attention.v_lin.scale_out 1.936693549156189
distilbert.transformer.layer.0.attention.v_lin.scale_in 1.963030219078064
distilbert.transformer.layer.0.attention.out_lin.scale_out 1.9090875387191772
distilbert.transformer.layer.0.attention.out_lin.scale_in 1.9155340194702148
distilbert.transformer.layer.1.attention.q_lin.scale_out 2.004789352416992
distilbert.transformer.layer.1.attention.q_lin.scale_in 1.9639787673950195
distilbert.transformer.layer.1.attention.k_lin.scale_out 1.9920339584350586
distilbert.transformer.layer.1.attention.k_lin.scale_in 1.974652647972107
distilbert.transformer.layer.1.attention.v_lin.scale_out 1.9344266653060913
distilbert.transformer.layer.1.attention.v_lin.scale_in 1.9241001605987549
distilbert.transformer.layer.1.attention.out_lin.scale_out 1.9053921699523926
distilbert.transformer.layer.1.attention.out_lin.scale_in 1.9328597784042358
distilbert.transformer.layer.2.attention.q_lin.scale_out 2.0111687183380127
distilbert.transformer.layer.2.attention.q_lin.scale_in 1.9665979146957397
distilbert.transformer.layer.2.attention.k_lin.scale_out 2.0546875
distilbert.transformer.layer.2.attention.k_lin.scale_in 1.987182855606079
distilbert.transformer.layer.2.attention.v_lin.scale_out 1.920434832572937
distilbert.transformer.layer.2.attention.v_lin.scale_in 1.9206979274749756
distilbert.transformer.layer.2.attention.out_lin.scale_out 1.900865912437439
distilbert.transformer.layer.2.attention.out_lin.scale_in 1.908034324645996
distilbert.transformer.layer.3.attention.q_lin.scale_out 2.02260160446167
distilbert.transformer.layer.3.attention.q_lin.scale_in 1.9500030279159546
distilbert.transformer.layer.3.attention.k_lin.scale_out 2.0796260833740234
distilbert.transformer.layer.3.attention.k_lin.scale_in 1.943003535270691
distilbert.transformer.layer.3.attention.v_lin.scale_out 1.9345171451568604
distilbert.transformer.layer.3.attention.v_lin.scale_in 1.9029006958007812
distilbert.transformer.layer.3.attention.out_lin.scale_out 1.9271156787872314
distilbert.transformer.layer.3.attention.out_lin.scale_in 1.919654130935669
distilbert.transformer.layer.4.attention.q_lin.scale_out 2.04514741897583
distilbert.transformer.layer.4.attention.q_lin.scale_in 1.9204539060592651
distilbert.transformer.layer.4.attention.k_lin.scale_out 2.066929340362549
distilbert.transformer.layer.4.attention.k_lin.scale_in 1.9324769973754883
distilbert.transformer.layer.4.attention.v_lin.scale_out 1.829803466796875
distilbert.transformer.layer.4.attention.v_lin.scale_in 1.9489554166793823
distilbert.transformer.layer.4.attention.out_lin.scale_out 1.9196603298187256
distilbert.transformer.layer.4.attention.out_lin.scale_in 1.8496103286743164
distilbert.transformer.layer.5.attention.q_lin.scale_out 1.94928777217865
distilbert.transformer.layer.5.attention.q_lin.scale_in 1.9072329998016357
distilbert.transformer.layer.5.attention.k_lin.scale_out 2.0761525630950928
distilbert.transformer.layer.5.attention.k_lin.scale_in 1.9125897884368896
distilbert.transformer.layer.5.attention.v_lin.scale_out 1.8967959880828857
distilbert.transformer.layer.5.attention.v_lin.scale_in 1.904982566833496
distilbert.transformer.layer.5.attention.out_lin.scale_out 1.8897030353546143
distilbert.transformer.layer.5.attention.out_lin.scale_in 1.9557445049285889
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.scale_out weight norm: 58.5368
distilbert.transformer.layer.0.attention.q_lin.scale_in weight norm: 55.0002
distilbert.transformer.layer.0.attention.k_lin.scale_out weight norm: 58.0945
distilbert.transformer.layer.0.attention.k_lin.scale_in weight norm: 54.8807
distilbert.transformer.layer.0.attention.v_lin.scale_out weight norm: 54.3976
distilbert.transformer.layer.0.attention.v_lin.scale_in weight norm: 55.0108
distilbert.transformer.layer.0.attention.out_lin.scale_out weight norm: 53.5766
distilbert.transformer.layer.0.attention.out_lin.scale_in weight norm: 53.7535
distilbert.transformer.layer.1.attention.q_lin.scale_out weight norm: 56.4095
distilbert.transformer.layer.1.attention.q_lin.scale_in weight norm: 55.0503
distilbert.transformer.layer.1.attention.k_lin.scale_out weight norm: 55.9582
distilbert.transformer.layer.1.attention.k_lin.scale_in weight norm: 55.2007
distilbert.transformer.layer.1.attention.v_lin.scale_out weight norm: 54.1290
distilbert.transformer.layer.1.attention.v_lin.scale_in weight norm: 53.8809
distilbert.transformer.layer.1.attention.out_lin.scale_out weight norm: 53.3206
distilbert.transformer.layer.1.attention.out_lin.scale_in weight norm: 54.1953
distilbert.transformer.layer.2.attention.q_lin.scale_out weight norm: 56.3965
distilbert.transformer.layer.2.attention.q_lin.scale_in weight norm: 55.0122
distilbert.transformer.layer.2.attention.k_lin.scale_out weight norm: 57.7370
distilbert.transformer.layer.2.attention.k_lin.scale_in weight norm: 55.6521
distilbert.transformer.layer.2.attention.v_lin.scale_out weight norm: 53.7029
distilbert.transformer.layer.2.attention.v_lin.scale_in weight norm: 53.8140
distilbert.transformer.layer.2.attention.out_lin.scale_out weight norm: 53.3594
distilbert.transformer.layer.2.attention.out_lin.scale_in weight norm: 53.4964
distilbert.transformer.layer.3.attention.q_lin.scale_out weight norm: 56.8060
distilbert.transformer.layer.3.attention.q_lin.scale_in weight norm: 54.5576
distilbert.transformer.layer.3.attention.k_lin.scale_out weight norm: 58.4434
distilbert.transformer.layer.3.attention.k_lin.scale_in weight norm: 54.5110
distilbert.transformer.layer.3.attention.v_lin.scale_out weight norm: 54.1719
distilbert.transformer.layer.3.attention.v_lin.scale_in weight norm: 53.3051
distilbert.transformer.layer.3.attention.out_lin.scale_out weight norm: 54.0121
distilbert.transformer.layer.3.attention.out_lin.scale_in weight norm: 53.7271
distilbert.transformer.layer.4.attention.q_lin.scale_out weight norm: 57.5159
distilbert.transformer.layer.4.attention.q_lin.scale_in weight norm: 53.7450
distilbert.transformer.layer.4.attention.k_lin.scale_out weight norm: 58.1790
distilbert.transformer.layer.4.attention.k_lin.scale_in weight norm: 54.0628
distilbert.transformer.layer.4.attention.v_lin.scale_out weight norm: 51.6230
distilbert.transformer.layer.4.attention.v_lin.scale_in weight norm: 54.3730
distilbert.transformer.layer.4.attention.out_lin.scale_out weight norm: 54.0633
distilbert.transformer.layer.4.attention.out_lin.scale_in weight norm: 52.0945
distilbert.transformer.layer.5.attention.q_lin.scale_out weight norm: 55.0960
distilbert.transformer.layer.5.attention.q_lin.scale_in weight norm: 53.4967
distilbert.transformer.layer.5.attention.k_lin.scale_out weight norm: 58.2587
distilbert.transformer.layer.5.attention.k_lin.scale_in weight norm: 53.5461
distilbert.transformer.layer.5.attention.v_lin.scale_out weight norm: 53.3409
distilbert.transformer.layer.5.attention.v_lin.scale_in weight norm: 53.3365
distilbert.transformer.layer.5.attention.out_lin.scale_out weight norm: 53.2633
distilbert.transformer.layer.5.attention.out_lin.scale_in weight norm: 54.5414
Parameter Statistics: mean.abs()
distilbert.transformer.layer.0.attention.q_lin.m_out 0.3918760418891907
distilbert.transformer.layer.0.attention.q_lin.m_in 0.2821083962917328
distilbert.transformer.layer.0.attention.k_lin.m_out 0.388096421957016
distilbert.transformer.layer.0.attention.k_lin.m_in 0.2891788184642792
distilbert.transformer.layer.0.attention.v_lin.m_out 0.2867240905761719
distilbert.transformer.layer.0.attention.v_lin.m_in 0.28405582904815674
distilbert.transformer.layer.0.attention.out_lin.m_out 0.26317834854125977
distilbert.transformer.layer.0.attention.out_lin.m_in 0.25337088108062744
distilbert.transformer.layer.1.attention.q_lin.m_out 0.3328227698802948
distilbert.transformer.layer.1.attention.q_lin.m_in 0.27658596634864807
distilbert.transformer.layer.1.attention.k_lin.m_out 0.3300029933452606
distilbert.transformer.layer.1.attention.k_lin.m_in 0.27019354701042175
distilbert.transformer.layer.1.attention.v_lin.m_out 0.2650151252746582
distilbert.transformer.layer.1.attention.v_lin.m_in 0.2554093599319458
distilbert.transformer.layer.1.attention.out_lin.m_out 0.25520753860473633
distilbert.transformer.layer.1.attention.out_lin.m_in 0.25274693965911865
distilbert.transformer.layer.2.attention.q_lin.m_out 0.32323920726776123
distilbert.transformer.layer.2.attention.q_lin.m_in 0.26473814249038696
distilbert.transformer.layer.2.attention.k_lin.m_out 0.3654788136482239
distilbert.transformer.layer.2.attention.k_lin.m_in 0.29941341280937195
distilbert.transformer.layer.2.attention.v_lin.m_out 0.24500706791877747
distilbert.transformer.layer.2.attention.v_lin.m_in 0.24774298071861267
distilbert.transformer.layer.2.attention.out_lin.m_out 0.26631924510002136
distilbert.transformer.layer.2.attention.out_lin.m_in 0.23847880959510803
distilbert.transformer.layer.3.attention.q_lin.m_out 0.34433692693710327
distilbert.transformer.layer.3.attention.q_lin.m_in 0.2763220965862274
distilbert.transformer.layer.3.attention.k_lin.m_out 0.38676881790161133
distilbert.transformer.layer.3.attention.k_lin.m_in 0.28456243872642517
distilbert.transformer.layer.3.attention.v_lin.m_out 0.25427114963531494
distilbert.transformer.layer.3.attention.v_lin.m_in 0.23790866136550903
distilbert.transformer.layer.3.attention.out_lin.m_out 0.2615028917789459
distilbert.transformer.layer.3.attention.out_lin.m_in 0.25258609652519226
distilbert.transformer.layer.4.attention.q_lin.m_out 0.35477954149246216
distilbert.transformer.layer.4.attention.q_lin.m_in 0.2452540099620819
distilbert.transformer.layer.4.attention.k_lin.m_out 0.3896406590938568
distilbert.transformer.layer.4.attention.k_lin.m_in 0.25737807154655457
distilbert.transformer.layer.4.attention.v_lin.m_out 0.22937028110027313
distilbert.transformer.layer.4.attention.v_lin.m_in 0.24845485389232635
distilbert.transformer.layer.4.attention.out_lin.m_out 0.278006374835968
distilbert.transformer.layer.4.attention.out_lin.m_in 0.2266145646572113
distilbert.transformer.layer.5.attention.q_lin.m_out 0.32691702246665955
distilbert.transformer.layer.5.attention.q_lin.m_in 0.25372225046157837
distilbert.transformer.layer.5.attention.k_lin.m_out 0.38233980536460876
distilbert.transformer.layer.5.attention.k_lin.m_in 0.24921034276485443
distilbert.transformer.layer.5.attention.v_lin.m_out 0.25670918822288513
distilbert.transformer.layer.5.attention.v_lin.m_in 0.22960709035396576
distilbert.transformer.layer.5.attention.out_lin.m_out 0.2548404633998871
distilbert.transformer.layer.5.attention.out_lin.m_in 0.22926835715770721
Parameter Statistics: param.norm()
distilbert.transformer.layer.0.attention.q_lin.m_out weight norm: 13.5111
distilbert.transformer.layer.0.attention.q_lin.m_in weight norm: 10.3667
distilbert.transformer.layer.0.attention.k_lin.m_out weight norm: 13.5175
distilbert.transformer.layer.0.attention.k_lin.m_in weight norm: 10.3886
distilbert.transformer.layer.0.attention.v_lin.m_out weight norm: 10.3022
distilbert.transformer.layer.0.attention.v_lin.m_in weight norm: 10.2630
distilbert.transformer.layer.0.attention.out_lin.m_out weight norm: 9.6783
distilbert.transformer.layer.0.attention.out_lin.m_in weight norm: 9.7998
distilbert.transformer.layer.1.attention.q_lin.m_out weight norm: 11.8675
distilbert.transformer.layer.1.attention.q_lin.m_in weight norm: 10.2858
distilbert.transformer.layer.1.attention.k_lin.m_out weight norm: 11.5865
distilbert.transformer.layer.1.attention.k_lin.m_in weight norm: 9.6285
distilbert.transformer.layer.1.attention.v_lin.m_out weight norm: 9.4300
distilbert.transformer.layer.1.attention.v_lin.m_in weight norm: 9.7012
distilbert.transformer.layer.1.attention.out_lin.m_out weight norm: 9.0223
distilbert.transformer.layer.1.attention.out_lin.m_in weight norm: 9.8704
distilbert.transformer.layer.2.attention.q_lin.m_out weight norm: 11.3432
distilbert.transformer.layer.2.attention.q_lin.m_in weight norm: 9.6139
distilbert.transformer.layer.2.attention.k_lin.m_out weight norm: 12.5924
distilbert.transformer.layer.2.attention.k_lin.m_in weight norm: 10.6472
distilbert.transformer.layer.2.attention.v_lin.m_out weight norm: 9.0080
distilbert.transformer.layer.2.attention.v_lin.m_in weight norm: 9.6870
distilbert.transformer.layer.2.attention.out_lin.m_out weight norm: 9.4817
distilbert.transformer.layer.2.attention.out_lin.m_in weight norm: 9.2836
distilbert.transformer.layer.3.attention.q_lin.m_out weight norm: 11.9146
distilbert.transformer.layer.3.attention.q_lin.m_in weight norm: 9.8234
distilbert.transformer.layer.3.attention.k_lin.m_out weight norm: 13.1844
distilbert.transformer.layer.3.attention.k_lin.m_in weight norm: 10.4047
distilbert.transformer.layer.3.attention.v_lin.m_out weight norm: 9.1708
distilbert.transformer.layer.3.attention.v_lin.m_in weight norm: 9.2959
distilbert.transformer.layer.3.attention.out_lin.m_out weight norm: 9.2903
distilbert.transformer.layer.3.attention.out_lin.m_in weight norm: 9.7511
distilbert.transformer.layer.4.attention.q_lin.m_out weight norm: 12.6141
distilbert.transformer.layer.4.attention.q_lin.m_in weight norm: 9.1202
distilbert.transformer.layer.4.attention.k_lin.m_out weight norm: 13.4419
distilbert.transformer.layer.4.attention.k_lin.m_in weight norm: 9.4171
distilbert.transformer.layer.4.attention.v_lin.m_out weight norm: 8.4905
distilbert.transformer.layer.4.attention.v_lin.m_in weight norm: 9.1461
distilbert.transformer.layer.4.attention.out_lin.m_out weight norm: 10.0894
distilbert.transformer.layer.4.attention.out_lin.m_in weight norm: 9.3265
distilbert.transformer.layer.5.attention.q_lin.m_out weight norm: 11.9401
distilbert.transformer.layer.5.attention.q_lin.m_in weight norm: 9.5767
distilbert.transformer.layer.5.attention.k_lin.m_out weight norm: 12.9022
distilbert.transformer.layer.5.attention.k_lin.m_in weight norm: 9.2130
distilbert.transformer.layer.5.attention.v_lin.m_out weight norm: 9.3175
distilbert.transformer.layer.5.attention.v_lin.m_in weight norm: 9.0608
distilbert.transformer.layer.5.attention.out_lin.m_out weight norm: 9.4061
distilbert.transformer.layer.5.attention.out_lin.m_in weight norm: 8.4106
In [16]:
import torch.nn.init as init

for name, module in model_ddora_all_attn.named_modules():
    if "transformer.layer.5" in name and hasattr(module, "lora_A") and hasattr(module, "lora_B"):
        print(f"Resetting LoRA weights in: {name}")
        init.normal_(module.lora_A.weight, mean=0.0, std=0.01)
        init.normal_(module.lora_B.weight, mean=0.0, std=0.01)
        if hasattr(module, "m_in"):
            init.normal_(module.m_in, mean=0.0, std=0.01)
        if hasattr(module, "m_out"):
            init.normal_(module.m_out, mean=0.0, std=0.01)
for name, module in model_ddora_all_attn.named_modules():
    if "transformer.layer.5" in name and hasattr(module, "lora_A"):
        A_std = module.lora_A.weight.std().item()
        B_std = module.lora_B.weight.std().item()
        print(f"{name}: |A|={module.lora_A.weight.abs().mean():.4f}, std={A_std:.4f} |B|={module.lora_B.weight.abs().mean():.4f}, std={B_std:.4f}")
In [17]:
for name, param in model_ddora_all_attn.named_parameters():
    if "lora" in name:
        print(name, param.abs().mean().item())
distilbert.transformer.layer.0.attention.q_lin.lora.A 0.287240594625473
distilbert.transformer.layer.0.attention.q_lin.lora.B 0.18772827088832855
distilbert.transformer.layer.0.attention.k_lin.lora.A 0.2909419536590576
distilbert.transformer.layer.0.attention.k_lin.lora.B 0.19361740350723267
distilbert.transformer.layer.0.attention.v_lin.lora.A 0.280423641204834
distilbert.transformer.layer.0.attention.v_lin.lora.B 0.1574709117412567
distilbert.transformer.layer.0.attention.out_lin.lora.A 0.2683144807815552
distilbert.transformer.layer.0.attention.out_lin.lora.B 0.1646396517753601
distilbert.transformer.layer.0.ffn.lin1.lora.A 0.3044332265853882
distilbert.transformer.layer.0.ffn.lin1.lora.B 0.2005985826253891
distilbert.transformer.layer.0.ffn.lin2.lora.A 0.26756101846694946
distilbert.transformer.layer.0.ffn.lin2.lora.B 0.16362197697162628
distilbert.transformer.layer.1.attention.q_lin.lora.A 0.2821109890937805
distilbert.transformer.layer.1.attention.q_lin.lora.B 0.17438463866710663
distilbert.transformer.layer.1.attention.k_lin.lora.A 0.2712245583534241
distilbert.transformer.layer.1.attention.k_lin.lora.B 0.17899440228939056
distilbert.transformer.layer.1.attention.v_lin.lora.A 0.26285862922668457
distilbert.transformer.layer.1.attention.v_lin.lora.B 0.14696842432022095
distilbert.transformer.layer.1.attention.out_lin.lora.A 0.2671390771865845
distilbert.transformer.layer.1.attention.out_lin.lora.B 0.14929549396038055
distilbert.transformer.layer.1.ffn.lin1.lora.A 0.28791946172714233
distilbert.transformer.layer.1.ffn.lin1.lora.B 0.1863688826560974
distilbert.transformer.layer.1.ffn.lin2.lora.A 0.27668440341949463
distilbert.transformer.layer.1.ffn.lin2.lora.B 0.15934498608112335
distilbert.transformer.layer.2.attention.q_lin.lora.A 0.2741093635559082
distilbert.transformer.layer.2.attention.q_lin.lora.B 0.17391812801361084
distilbert.transformer.layer.2.attention.k_lin.lora.A 0.28805288672447205
distilbert.transformer.layer.2.attention.k_lin.lora.B 0.1873616874217987
distilbert.transformer.layer.2.attention.v_lin.lora.A 0.25546205043792725
distilbert.transformer.layer.2.attention.v_lin.lora.B 0.12401476502418518
distilbert.transformer.layer.2.attention.out_lin.lora.A 0.26189106702804565
distilbert.transformer.layer.2.attention.out_lin.lora.B 0.15452931821346283
distilbert.transformer.layer.2.ffn.lin1.lora.A 0.30333781242370605
distilbert.transformer.layer.2.ffn.lin1.lora.B 0.20153628289699554
distilbert.transformer.layer.2.ffn.lin2.lora.A 0.2654007375240326
distilbert.transformer.layer.2.ffn.lin2.lora.B 0.15304669737815857
distilbert.transformer.layer.3.attention.q_lin.lora.A 0.2759910225868225
distilbert.transformer.layer.3.attention.q_lin.lora.B 0.1892491579055786
distilbert.transformer.layer.3.attention.k_lin.lora.A 0.29188308119773865
distilbert.transformer.layer.3.attention.k_lin.lora.B 0.19718174636363983
distilbert.transformer.layer.3.attention.v_lin.lora.A 0.2588292062282562
distilbert.transformer.layer.3.attention.v_lin.lora.B 0.13704168796539307
distilbert.transformer.layer.3.attention.out_lin.lora.A 0.2620876431465149
distilbert.transformer.layer.3.attention.out_lin.lora.B 0.14226451516151428
distilbert.transformer.layer.3.ffn.lin1.lora.A 0.26023590564727783
distilbert.transformer.layer.3.ffn.lin1.lora.B 0.16748926043510437
distilbert.transformer.layer.3.ffn.lin2.lora.A 0.25515085458755493
distilbert.transformer.layer.3.ffn.lin2.lora.B 0.13720864057540894
distilbert.transformer.layer.4.attention.q_lin.lora.A 0.2625563144683838
distilbert.transformer.layer.4.attention.q_lin.lora.B 0.18786773085594177
distilbert.transformer.layer.4.attention.k_lin.lora.A 0.2736532986164093
distilbert.transformer.layer.4.attention.k_lin.lora.B 0.18987171351909637
distilbert.transformer.layer.4.attention.v_lin.lora.A 0.2424963414669037
distilbert.transformer.layer.4.attention.v_lin.lora.B 0.12292399257421494
distilbert.transformer.layer.4.attention.out_lin.lora.A 0.2634068727493286
distilbert.transformer.layer.4.attention.out_lin.lora.B 0.13521724939346313
distilbert.transformer.layer.4.ffn.lin1.lora.A 0.2611706852912903
distilbert.transformer.layer.4.ffn.lin1.lora.B 0.15097293257713318
distilbert.transformer.layer.4.ffn.lin2.lora.A 0.23863813281059265
distilbert.transformer.layer.4.ffn.lin2.lora.B 0.11519813537597656
distilbert.transformer.layer.5.attention.q_lin.lora.A 0.26431819796562195
distilbert.transformer.layer.5.attention.q_lin.lora.B 0.1849026381969452
distilbert.transformer.layer.5.attention.k_lin.lora.A 0.26360854506492615
distilbert.transformer.layer.5.attention.k_lin.lora.B 0.1594821661710739
distilbert.transformer.layer.5.attention.v_lin.lora.A 0.24227461218833923
distilbert.transformer.layer.5.attention.v_lin.lora.B 0.1004873663187027
distilbert.transformer.layer.5.attention.out_lin.lora.A 0.24558258056640625
distilbert.transformer.layer.5.attention.out_lin.lora.B 0.11250372976064682
distilbert.transformer.layer.5.ffn.lin1.lora.A 0.2434004843235016
distilbert.transformer.layer.5.ffn.lin1.lora.B 0.1312120109796524
distilbert.transformer.layer.5.ffn.lin2.lora.A 0.2350156307220459
distilbert.transformer.layer.5.ffn.lin2.lora.B 0.11283187568187714
In [18]:
for name, module in model_ddora_all_attn.named_modules():
    if "transformer.layer.5" in name:
        if hasattr(module, "lora_A"):
            print(f"{name} has lora_A")
        if hasattr(module, "lora_B"):
            print(f"{name} has lora_B")
In [19]:
# Manually traverse to layer 5
layer5 = model_ddora_all_attn.distilbert.transformer.layer[5]

# Now inspect submodules
for name, module in layer5.named_modules():
    if hasattr(module, "lora_A"):
        print(f"{name} has lora_A")
    if hasattr(module, "lora_B"):
        print(f"{name} has lora_B")
In [20]:
layer5 = model_ddora_all_attn.distilbert.transformer.layer[5]

for name, module in layer5.named_modules():
    print(name, type(module))
for name, module in model_ddora_all_attn.named_modules():
    if hasattr(module, "lora_A"):
        print(name)
 <class 'transformers.models.distilbert.modeling_distilbert.TransformerBlock'>
attention <class 'transformers.models.distilbert.modeling_distilbert.DistilBertSdpaAttention'>
attention.dropout <class 'torch.nn.modules.dropout.Dropout'>
attention.q_lin <class '__main__.LinearWithDoubleDoRA'>
attention.q_lin.linear <class 'torch.nn.modules.linear.Linear'>
attention.q_lin.lora <class '__main__.LoRALayer'>
attention.q_lin.lora.dropout <class 'torch.nn.modules.dropout.Dropout'>
attention.k_lin <class '__main__.LinearWithDoubleDoRA'>
attention.k_lin.linear <class 'torch.nn.modules.linear.Linear'>
attention.k_lin.lora <class '__main__.LoRALayer'>
attention.k_lin.lora.dropout <class 'torch.nn.modules.dropout.Dropout'>
attention.v_lin <class '__main__.LinearWithDoubleDoRA'>
attention.v_lin.linear <class 'torch.nn.modules.linear.Linear'>
attention.v_lin.lora <class '__main__.LoRALayer'>
attention.v_lin.lora.dropout <class 'torch.nn.modules.dropout.Dropout'>
attention.out_lin <class '__main__.LinearWithDoubleDoRA'>
attention.out_lin.linear <class 'torch.nn.modules.linear.Linear'>
attention.out_lin.lora <class '__main__.LoRALayer'>
attention.out_lin.lora.dropout <class 'torch.nn.modules.dropout.Dropout'>
sa_layer_norm <class 'torch.nn.modules.normalization.LayerNorm'>
ffn <class 'transformers.models.distilbert.modeling_distilbert.FFN'>
ffn.dropout <class 'torch.nn.modules.dropout.Dropout'>
ffn.lin1 <class '__main__.LinearWithDoubleDoRA'>
ffn.lin1.linear <class 'torch.nn.modules.linear.Linear'>
ffn.lin1.lora <class '__main__.LoRALayer'>
ffn.lin1.lora.dropout <class 'torch.nn.modules.dropout.Dropout'>
ffn.lin2 <class '__main__.LinearWithDoubleDoRA'>
ffn.lin2.linear <class 'torch.nn.modules.linear.Linear'>
ffn.lin2.lora <class '__main__.LoRALayer'>
ffn.lin2.lora.dropout <class 'torch.nn.modules.dropout.Dropout'>
ffn.activation <class 'transformers.activations.GELUActivation'>
output_layer_norm <class 'torch.nn.modules.normalization.LayerNorm'>
In [21]:
for name, module in layer5.named_modules():
    if hasattr(module, "lora"):
        if hasattr(module.lora, "lora_A"):
            print(f"{name} has lora_A")
        if hasattr(module.lora, "lora_B"):
            print(f"{name} has lora_B")
In [22]:
for name, module in layer5.named_modules():
    if hasattr(module, "lora"):
        print(f"{name} has LoRA module with params:", list(vars(module.lora).keys()))
attention.q_lin has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha']
attention.k_lin has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha']
attention.v_lin has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha']
attention.out_lin has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha']
ffn.lin1 has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha']
ffn.lin2 has LoRA module with params: ['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'alpha']
In [23]:
for name, module in layer5.named_modules():
    if hasattr(module, "lora"):
        lora_params = dict(module.lora.named_parameters())
        if "lora_A" in lora_params:
            print(f"{name} has lora_A")
        if "lora_B" in lora_params:
            print(f"{name} has lora_B")
In [ ]: