From 5b2b84fa273db654fb2b78dde5503cc094b0f050 Mon Sep 17 00:00:00 2001
From: Riko Uphoff <riko.uphoff@student.uni-halle.de>
Date: Sun, 30 Mar 2025 15:06:52 +0200
Subject: [PATCH] Added GaLore and LoRA parameters to args; added dataset
 selection to args; added training scripts

---
 args.py                                       | 24 +++++++----
 load_data.py                                  | 17 +++++++-
 load_optimizers.py                            | 26 +++++++++---
 scripts/shell/finetune_roberta_galore_cola.sh | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_mnli.sh | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_mrpc.sh | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_qnli.sh | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_qqp.sh  | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_rte.sh  | 27 ++++++++++++
 .../shell/finetune_roberta_galore_sst-2.sh    | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_cola.sh   | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_mnli.sh   | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_mrpc.sh   | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_qnli.sh   | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_qqp.sh    | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_rte.sh    | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_sst-2.sh  | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_sts-b.sh  | 27 ++++++++++++
 scripts/shell/pretrain_60m.sh                 | 41 +++++++++++--------
 scripts/shell/pretrain_7b.sh                  | 41 +++++++++++--------
 20 files changed, 509 insertions(+), 45 deletions(-)
 create mode 100644 scripts/shell/finetune_roberta_galore_cola.sh
 create mode 100755 scripts/shell/finetune_roberta_galore_mnli.sh
 create mode 100644 scripts/shell/finetune_roberta_galore_mrpc.sh
 create mode 100644 scripts/shell/finetune_roberta_galore_qnli.sh
 create mode 100644 scripts/shell/finetune_roberta_galore_qqp.sh
 create mode 100644 scripts/shell/finetune_roberta_galore_rte.sh
 create mode 100755 scripts/shell/finetune_roberta_galore_sst-2.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_cola.sh
 create mode 100755 scripts/shell/finetune_roberta_lora_mnli.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_mrpc.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_qnli.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_qqp.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_rte.sh
 create mode 100755 scripts/shell/finetune_roberta_lora_sst-2.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_sts-b.sh

diff --git a/args.py b/args.py
index 6f40cf4..70393b6 100644
--- a/args.py
+++ b/args.py
@@ -2,21 +2,31 @@ import argparse
 
 parser = argparse.ArgumentParser(description="Run training")
 parser.add_argument("--mode", type=str, choices=["pretraining", "finetuning"], required=True, help="Training mode to use")
+parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
+parser.add_argument("--dataset", type=str, choices=["c4", "glue_mnli", "glue_sst-2", "glue_mrpc", "glue_cola", "glue_qnli", "glue_qqp", "glue_rte", "glue_sts-b"], required=True, help="Model to use")
+
 parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8bit", "lora+galore8bit", "baseline"], required=True, help="Optimizer type to use")
+parser.add_argument("--weight_decay", type=float, default=0, help="Weight decay for optimizer")
+parser.add_argument("--rank", type=int, default=128, help="Rank of the sub-space for LoRA and GaLore")
+parser.add_argument("--galore_alpha", type=float, default=1.0, help="Scaling factor for optimizer updates")
+parser.add_argument("--galore_T", type=int, default=200, help="Sub-space change frequency")
+parser.add_argument("--lora_alpha", type=float, default=1.0, help="Scaling factor for optimizer updates")
+parser.add_argument("--lora_dropout", type=float, default=0.1, help="Dropout rate for LoRA")
+
 parser.add_argument("--lr_scheduler", type=str, choices=["constant", "linear", "cosine"], default="constant")
-parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
+parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer")
+parser.add_argument("--lr_min", type=float, default=0, help="Minimum learning rate for annealing")
+parser.add_argument("--warm_up_fraction", type=float, default=0, help="Fraction of training steps to use maximum learning rate as a warm-up")
+
 parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
 parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs")
 parser.add_argument("--max_length", type=int, default=512, help="Max length of input tokens")
 parser.add_argument("--num_training_tokens", type=int, default=1e9, help="Number of training tokens")
 parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)")
 parser.add_argument("--dtype", type=str, choices=["bf16", "fp16"], default="fp16", help="Data type to use") # TODO for now just bf16 working
-parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer")
-parser.add_argument("--lr_min", type=float, default=0, help="Minimum learning rate for annealing")
-parser.add_argument("--warm_up_fraction", type=float, default=0, help="Fraction of training steps to use maximum learning rate as a warm-up")
-parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay for optimizer")
 # parser.add_argument("--tmax", type=int, default=30, help="Tmax for scheduler")
-parser.add_argument("--lora_config", type=str, default="config/lora_config.json", help="Path to LoRa config file")
-parser.add_argument("--galore_config", type=str, default="config/galore_config.json", help="Path to GaLore config file")
+parser.add_argument("--lora_config", type=str, default=None, help="Path to LoRa config file")
+parser.add_argument("--galore_config", type=str, default=None, help="Path to GaLore config file")
+
 parser.add_argument("--test", type=str, choices=["true", "false"], default="false", help="Test mode")
 args = parser.parse_args()
diff --git a/load_data.py b/load_data.py
index 028a2a1..1a7ed48 100644
--- a/load_data.py
+++ b/load_data.py
@@ -1,6 +1,7 @@
 import torch
 from datasets import load_dataset
 
+
 def load_data(args, tokenizer):
     if args.mode == "pretraining":
         return load_data_pretrain(args, tokenizer)
@@ -9,6 +10,7 @@ def load_data(args, tokenizer):
     else:
         raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
 
+
 def load_data_pretrain(args, tokenizer):
     dataset = load_dataset("allenai/c4", "realnewslike", streaming=True, split="train")
     dataset = dataset.take(args.num_training_tokens)
@@ -25,8 +27,21 @@ def load_data_pretrain(args, tokenizer):
 
     return dataset
 
+
 def load_data_finetune(args, tokenizer):
-    dataset = load_dataset("glue", "sst2")
+    arg_map = {
+        "glue_mnli": ("glue", "mnli"),
+        "glue_sst-2": ("glue", "sst2"),
+        "glue_mrpc": ("glue", "mrpc"),
+        "glue_cola": ("glue", "cola"),
+        "glue_qnli": ("glue", "qnli"),
+        "glue_qqp": ("glue", "qqp"),
+        "glue_rte": ("glue", "rte"),
+        "glue_sts-b": ("glue", "stsb"),
+    }
+    if args.dataset not in arg_map:
+        raise ValueError(f"Dataset '{args.dataset}' not supported for mode 'finetuning'!")
+    dataset = load_dataset(*arg_map[args.dataset])
 
     def tokenize_function_finetune(batch):
         return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=args.max_length)
diff --git a/load_optimizers.py b/load_optimizers.py
index cde4e1d..08a91cf 100644
--- a/load_optimizers.py
+++ b/load_optimizers.py
@@ -6,8 +6,17 @@ import json
 
 def load_lora_config(args):
     """Loads LoRa configuration from file"""
-    with open(args.lora_config, "r") as f:
-        lora_params = json.load(f)
+    if args.lora_config is not None:
+        with open(args.lora_config, "r") as f:
+            lora_params = json.load(f)
+    else:
+        lora_params = {
+            "r": args.rank,
+            "lora_alpha": args.lora_alpha,
+            "lora_dropout": args.lora_dropout,
+            "target_modules_finetuning": ["query", "value"],
+            "target_modules_pretraining": ["q_proj", "v_proj"]
+        }
 
     target_modules = lora_params["target_modules_finetuning"] if args.mode == "finetuning" else lora_params[
         "target_modules_pretraining"]
@@ -22,14 +31,21 @@ def load_lora_config(args):
 
 def load_galore_config(args):
     """Loads GaLore configuration from file"""
-    with open(args.galore_config, "r") as f:
-        return json.load(f)
+    if args.galore_config is not None:
+        with open(args.galore_config, "r") as f:
+            return json.load(f)
+    else:
+        return {
+            "rank": args.rank,
+            "update_proj_gap": args.galore_T,
+            "scale": args.galore_alpha,
+            "proj_type": "std"
+        }
 
 
 def get_optimizer(args, model):
     """Creates optimizer (GaLore, LoRa, or baseline AdamW)"""
     default_lr = 1.0  # Will be scheduled by LRScheduler
-    # TODO What to do with weight_decay for AdamW?
 
     if args.optimizer == "baseline":
         return AdamW(model.parameters(), lr=default_lr, weight_decay=args.weight_decay), model
diff --git a/scripts/shell/finetune_roberta_galore_cola.sh b/scripts/shell/finetune_roberta_galore_cola.sh
new file mode 100644
index 0000000..c8a08a4
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_cola.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_cola \
+      --batch_size 32 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 1e-5 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_mnli.sh b/scripts/shell/finetune_roberta_galore_mnli.sh
new file mode 100755
index 0000000..08e9d17
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_mnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_mnli \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 1e-5 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_mrpc.sh b/scripts/shell/finetune_roberta_galore_mrpc.sh
new file mode 100644
index 0000000..adc15ad
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_mrpc.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_mrpc \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 2e-5 \
+      --lr_min 2e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_qnli.sh b/scripts/shell/finetune_roberta_galore_qnli.sh
new file mode 100644
index 0000000..5353903
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_qnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_qnli \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 1e-5 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_qqp.sh b/scripts/shell/finetune_roberta_galore_qqp.sh
new file mode 100644
index 0000000..1380838
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_qqp.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_qqp \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 2e-5 \
+      --lr_min 2e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_rte.sh b/scripts/shell/finetune_roberta_galore_rte.sh
new file mode 100644
index 0000000..740a035
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_rte.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_rte \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 2e-5 \
+      --lr_min 2e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_sst-2.sh b/scripts/shell/finetune_roberta_galore_sst-2.sh
new file mode 100755
index 0000000..c9b5b45
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_sst-2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_sst-2 \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 2e-5 \
+      --lr_min 2e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_cola.sh b/scripts/shell/finetune_roberta_lora_cola.sh
new file mode 100644
index 0000000..322389a
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_cola.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_cola \
+      --batch_size 32 \
+      --num_epochs 80 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 4e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_mnli.sh b/scripts/shell/finetune_roberta_lora_mnli.sh
new file mode 100755
index 0000000..cf2210c
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_mnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_mnli \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 5e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_mrpc.sh b/scripts/shell/finetune_roberta_lora_mrpc.sh
new file mode 100644
index 0000000..fda767a
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_mrpc.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_mrpc \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 4e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_qnli.sh b/scripts/shell/finetune_roberta_lora_qnli.sh
new file mode 100644
index 0000000..3380926
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_qnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_qnli \
+      --batch_size 32 \
+      --num_epochs 25 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 4e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_qqp.sh b/scripts/shell/finetune_roberta_lora_qqp.sh
new file mode 100644
index 0000000..d95a220
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_qqp.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_qqp \
+      --batch_size 16 \
+      --num_epochs 25 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 5e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_rte.sh b/scripts/shell/finetune_roberta_lora_rte.sh
new file mode 100644
index 0000000..3e9e35e
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_rte.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_rte \
+      --batch_size 32 \
+      --num_epochs 80 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 5e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_sst-2.sh b/scripts/shell/finetune_roberta_lora_sst-2.sh
new file mode 100755
index 0000000..024c506
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_sst-2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_sst-2 \
+      --batch_size 16 \
+      --num_epochs 60 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 5e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_sts-b.sh b/scripts/shell/finetune_roberta_lora_sts-b.sh
new file mode 100644
index 0000000..704549d
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_sts-b.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_sts-b \
+      --batch_size 16 \
+      --num_epochs 40 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 4e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/pretrain_60m.sh b/scripts/shell/pretrain_60m.sh
index 77432d9..f7ae642 100755
--- a/scripts/shell/pretrain_60m.sh
+++ b/scripts/shell/pretrain_60m.sh
@@ -1,18 +1,27 @@
 #!/bin/bash
 
-python3 main.py \
-    --mode pretraining \
-    --optimizer galore \
-    --model llama_60m \
-    --batch_size 512 \
-    --num_epochs 1 \
-    --num_training_tokens 1310000000 \
-    --max_length 256 \
-    --shuffle false \
-    --dtype bf16 \
-    --lr_scheduler cosine \
-    --lr 1e-4 \
-    --lr_min 1e-5 \
-    --warm_up_fraction 0.1 \
-    --weight_decay 0 \
-    --test false
\ No newline at end of file
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode pretraining \
+      --optimizer "$optimizer" \
+      --model llama_60m \
+      --batch_size 512 \
+      --num_epochs 1 \
+      --num_training_tokens 1310000000 \
+      --max_length 256 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler cosine \
+      --lr 1e-4 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0.1 \
+      --weight_decay 0 \
+      --rank 128 \
+      --galore_alpha 0.25 \
+      --galore_T 200 \
+      --lora_alpha 32 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/pretrain_7b.sh b/scripts/shell/pretrain_7b.sh
index 4b04554..2421458 100644
--- a/scripts/shell/pretrain_7b.sh
+++ b/scripts/shell/pretrain_7b.sh
@@ -1,18 +1,27 @@
 #!/bin/bash
 
-python3 main.py \
-    --mode pretraining \
-    --optimizer galore8bit \
-    --model llama_7b \
-    --batch_size 512 \
-    --num_epochs 1 \
-    --num_training_tokens 13100000 \
-    --max_length 256 \
-    --shuffle false \
-    --dtype bf16 \
-    --lr_scheduler cosine \
-    --lr 1e-4 \
-    --lr_min 1e-5 \
-    --warm_up_fraction 0.1 \
-    --weight_decay 0 \
-    --test false
\ No newline at end of file
+optimizers=("galore8bit")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode pretraining \
+      --optimizer "$optimizer" \
+      --model llama_7b \
+      --batch_size 512 \
+      --num_epochs 1 \
+      --num_training_tokens 13100000 \
+      --max_length 256 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler cosine \
+      --lr 1e-4 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0.1 \
+      --weight_decay 0 \
+      --rank 128 \
+      --galore_alpha 0.25 \
+      --galore_T 200 \
+      --lora_alpha 32 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
-- 
GitLab