diff --git a/args.py b/args.py
index 6f40cf45b72320e265523799b08b927e7a2fb112..70393b6ca3800d592a3bdd8def6a862a71630c6d 100644
--- a/args.py
+++ b/args.py
@@ -2,21 +2,31 @@ import argparse
 
 parser = argparse.ArgumentParser(description="Run training")
 parser.add_argument("--mode", type=str, choices=["pretraining", "finetuning"], required=True, help="Training mode to use")
+parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
+parser.add_argument("--dataset", type=str, choices=["c4", "glue_mnli", "glue_sst-2", "glue_mrpc", "glue_cola", "glue_qnli", "glue_qqp", "glue_rte", "glue_sts-b"], required=True, help="Model to use")
+
 parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8bit", "lora+galore8bit", "baseline"], required=True, help="Optimizer type to use")
+parser.add_argument("--weight_decay", type=float, default=0, help="Weight decay for optimizer")
+parser.add_argument("--rank", type=int, default=128, help="Rank of the sub-space for LoRA and GaLore")
+parser.add_argument("--galore_alpha", type=float, default=1.0, help="Scaling factor for optimizer updates")
+parser.add_argument("--galore_T", type=int, default=200, help="Sub-space change frequency")
+parser.add_argument("--lora_alpha", type=float, default=1.0, help="Scaling factor for optimizer updates")
+parser.add_argument("--lora_dropout", type=float, default=0.1, help="Dropout rate for LoRA")
+
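+# Learning-rate schedule settings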
 parser.add_argument("--lr_scheduler", type=str, choices=["constant", "linear", "cosine"], default="constant")
-parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
+parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer")
+parser.add_argument("--lr_min", type=float, default=0, help="Minimum learning rate for annealing")
+parser.add_argument("--warm_up_fraction", type=float, default=0, help="Fraction of training steps to use maximum learning rate as a warm-up")
+
 parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
 parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs")
 parser.add_argument("--max_length", type=int, default=512, help="Max length of input tokens")
 parser.add_argument("--num_training_tokens", type=int, default=1e9, help="Number of training tokens")
 parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)")
 parser.add_argument("--dtype", type=str, choices=["bf16", "fp16"], default="fp16", help="Data type to use") # TODO for now just bf16 working
-parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer")
-parser.add_argument("--lr_min", type=float, default=0, help="Minimum learning rate for annealing")
-parser.add_argument("--warm_up_fraction", type=float, default=0, help="Fraction of training steps to use maximum learning rate as a warm-up")
-parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay for optimizer")
 # parser.add_argument("--tmax", type=int, default=30, help="Tmax for scheduler")
-parser.add_argument("--lora_config", type=str, default="config/lora_config.json", help="Path to LoRa config file")
-parser.add_argument("--galore_config", type=str, default="config/galore_config.json", help="Path to GaLore config file")
+parser.add_argument("--lora_config", type=str, default=None, help="Path to LoRa config file")
+parser.add_argument("--galore_config", type=str, default=None, help="Path to GaLore config file")
+
 parser.add_argument("--test", type=str, choices=["true", "false"], default="false", help="Test mode")
 args = parser.parse_args()
diff --git a/load_data.py b/load_data.py
index 028a2a14abf29d79a122e82c05c5052b62d96abc..1a7ed4869ddba45ec635490bdb66d1a8cbd5f2b9 100644
--- a/load_data.py
+++ b/load_data.py
@@ -1,6 +1,7 @@
 import torch
 from datasets import load_dataset
 
+
 def load_data(args, tokenizer):
     if args.mode == "pretraining":
         return load_data_pretrain(args, tokenizer)
@@ -9,6 +10,7 @@ def load_data(args, tokenizer):
     else:
         raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
 
+
 def load_data_pretrain(args, tokenizer):
     dataset = load_dataset("allenai/c4", "realnewslike", streaming=True, split="train")
     dataset = dataset.take(args.num_training_tokens)
@@ -25,8 +27,21 @@ def load_data_pretrain(args, tokenizer):
 
     return dataset
 
+
 def load_data_finetune(args, tokenizer):
-    dataset = load_dataset("glue", "sst2")
+    arg_map = {
+        "glue_mnli": ("glue", "mnli"),
+        "glue_sst-2": ("glue", "sst2"),
+        "glue_mrpc": ("glue", "mrpc"),
+        "glue_cola": ("glue", "cola"),
+        "glue_qnli": ("glue", "qnli"),
+        "glue_qqp": ("glue", "qqp"),
+        "glue_rte": ("glue", "rte"),
+        "glue_sts-b": ("glue", "stsb"),
+    }
+    if args.dataset not in arg_map:
+        raise ValueError(f"Data set '{args.dataset}' not supported for mode 'finetuning'!")
+    dataset = load_dataset(*arg_map[args.dataset])
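+    # NOTE: tokenize_function_finetune below assumes a single "sentence" column (SST-2/CoLA);
+    # sentence-pair tasks such as MNLI or MRPC would need their own field mapping.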
 
     def tokenize_function_finetune(batch):
         return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=args.max_length)
diff --git a/load_optimizers.py b/load_optimizers.py
index cde4e1dba1787cbfe6d49c053eb83cc44ec98735..08a91cf0a8ddb047007509f90ebca239167e6a77 100644
--- a/load_optimizers.py
+++ b/load_optimizers.py
@@ -6,8 +6,17 @@ import json
 
 def load_lora_config(args):
     """Loads LoRa configuration from file"""
-    with open(args.lora_config, "r") as f:
-        lora_params = json.load(f)
+    if args.lora_config is not None:
+        with open(args.lora_config, "r") as f:
+            lora_params = json.load(f)
+    else:
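+        # No config file given: build the LoRA parameters from the CLI arguments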
+        lora_params = {
+            "r": args.rank,
+            "lora_alpha": args.lora_alpha,
+            "lora_dropout": args.lora_dropout,
+            "target_modules_finetuning": ["query", "value"],
+            "target_modules_pretraining": ["q_proj", "v_proj"]
+        }
 
     target_modules = lora_params["target_modules_finetuning"] if args.mode == "finetuning" else lora_params[
         "target_modules_pretraining"]
@@ -22,14 +31,21 @@ def load_lora_config(args):
 
 def load_galore_config(args):
     """Loads GaLore configuration from file"""
-    with open(args.galore_config, "r") as f:
-        return json.load(f)
+    if args.galore_config is not None:
+        with open(args.galore_config, "r") as f:
+            return json.load(f)
+    else:
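+        # No config file given: build the GaLore parameters from the CLI arguments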
+        return {
+            "rank": args.rank,
+            "update_proj_gap": args.galore_T,
+            "scale": args.galore_alpha,
+            "proj_type": "std"
+        }
 
 
 def get_optimizer(args, model):
     """Creates optimizer (GaLore, LoRa, or baseline AdamW)"""
     default_lr = 1.0  # Will be scheduled by LRScheduler
-    # TODO What to do with weight_decay for AdamW?
 
     if args.optimizer == "baseline":
         return AdamW(model.parameters(), lr=default_lr, weight_decay=args.weight_decay), model
diff --git a/scripts/shell/finetune_roberta_galore_cola.sh b/scripts/shell/finetune_roberta_galore_cola.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c8a08a4c7388d8822279bea11090c150e57935c7
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_cola.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_cola \
+      --batch_size 32 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 1e-5 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_mnli.sh b/scripts/shell/finetune_roberta_galore_mnli.sh
new file mode 100755
index 0000000000000000000000000000000000000000..08e9d17d3da64c7be974a462fcad70ef73c6d19f
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_mnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_mnli \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 1e-5 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_mrpc.sh b/scripts/shell/finetune_roberta_galore_mrpc.sh
new file mode 100644
index 0000000000000000000000000000000000000000..adc15ad386888791f6d3b498a9f823b3381dbe3e
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_mrpc.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_mrpc \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 2e-5 \
+      --lr_min 2e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_qnli.sh b/scripts/shell/finetune_roberta_galore_qnli.sh
new file mode 100644
index 0000000000000000000000000000000000000000..53539038ac18ade25f02cc71ec6bbbfc7c914116
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_qnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_qnli \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 1e-5 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_qqp.sh b/scripts/shell/finetune_roberta_galore_qqp.sh
new file mode 100644
index 0000000000000000000000000000000000000000..13808381648f3c4f42df06536089777809f50a72
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_qqp.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_qqp \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 2e-5 \
+      --lr_min 2e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_rte.sh b/scripts/shell/finetune_roberta_galore_rte.sh
new file mode 100644
index 0000000000000000000000000000000000000000..740a035f9b68bf6542d8317791541458cf3d7931
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_rte.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_rte \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 2e-5 \
+      --lr_min 2e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_sst-2.sh b/scripts/shell/finetune_roberta_galore_sst-2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c9b5b450817c03ee30f4dde705855553e5226ac7
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_sst-2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_sst-2 \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler constant \
+      --lr 2e-5 \
+      --lr_min 2e-5 \
+      --warm_up_fraction 0 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_cola.sh b/scripts/shell/finetune_roberta_lora_cola.sh
new file mode 100644
index 0000000000000000000000000000000000000000..322389ab88170054ac4825345ac811562d2d7b5d
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_cola.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_cola \
+      --batch_size 32 \
+      --num_epochs 80 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 4e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_mnli.sh b/scripts/shell/finetune_roberta_lora_mnli.sh
new file mode 100755
index 0000000000000000000000000000000000000000..cf2210c12168397e648b64a87a10e45276873050
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_mnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_mnli \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 5e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_mrpc.sh b/scripts/shell/finetune_roberta_lora_mrpc.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fda767a9ed8a9a8b3b1994fdbee21b221912640f
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_mrpc.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_mrpc \
+      --batch_size 16 \
+      --num_epochs 30 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 4e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_qnli.sh b/scripts/shell/finetune_roberta_lora_qnli.sh
new file mode 100644
index 0000000000000000000000000000000000000000..33809266f66bb95f32296d07def61bff363de030
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_qnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_qnli \
+      --batch_size 32 \
+      --num_epochs 25 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 4e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_qqp.sh b/scripts/shell/finetune_roberta_lora_qqp.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d95a220cdae80012d1963943d35542c4915d324e
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_qqp.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_qqp \
+      --batch_size 16 \
+      --num_epochs 25 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 5e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_rte.sh b/scripts/shell/finetune_roberta_lora_rte.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3e9e35ef8f78f7dd6a0197bd1e48546cf2636f1e
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_rte.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_rte \
+      --batch_size 32 \
+      --num_epochs 80 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 5e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_sst-2.sh b/scripts/shell/finetune_roberta_lora_sst-2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..024c506d94c3ec6741077af53a8eb5ef26d914bb
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_sst-2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_sst-2 \
+      --batch_size 16 \
+      --num_epochs 60 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 5e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_sts-b.sh b/scripts/shell/finetune_roberta_lora_sts-b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..704549d49f0ac6cc02ed05bab334675206398121
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_sts-b.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode finetuning \
+      --optimizer "$optimizer" \
+      --model roberta \
+      --dataset glue_sts-b \
+      --batch_size 16 \
+      --num_epochs 40 \
+      --max_length 512 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler linear \
+      --lr 4e-4 \
+      --lr_min 0.01 \
+      --warm_up_fraction 0.06 \
+      --weight_decay 0 \
+      --rank 8 \
+      --galore_alpha 2 \
+      --galore_T 200 \
+      --lora_alpha 8 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/pretrain_60m.sh b/scripts/shell/pretrain_60m.sh
index 77432d904b31a0aeb06858d495e56f0faea216a3..f7ae642413e81c0caefd8e2b751fe3bb4ddb96a4 100755
--- a/scripts/shell/pretrain_60m.sh
+++ b/scripts/shell/pretrain_60m.sh
@@ -1,18 +1,27 @@
 #!/bin/bash
 
-python3 main.py \
-    --mode pretraining \
-    --optimizer galore \
-    --model llama_60m \
-    --batch_size 512 \
-    --num_epochs 1 \
-    --num_training_tokens 1310000000 \
-    --max_length 256 \
-    --shuffle false \
-    --dtype bf16 \
-    --lr_scheduler cosine \
-    --lr 1e-4 \
-    --lr_min 1e-5 \
-    --warm_up_fraction 0.1 \
-    --weight_decay 0 \
-    --test false
\ No newline at end of file
+optimizers=("galore galore8bit lora lora+galore8bit baseline")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode pretraining \
+      --optimizer "$optimizer" \
+      --model llama_60m \
+      --batch_size 512 \
+      --num_epochs 1 \
+      --num_training_tokens 1310000000 \
+      --max_length 256 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler cosine \
+      --lr 1e-4 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0.1 \
+      --weight_decay 0 \
+      --rank 128 \
+      --galore_alpha 0.25 \
+      --galore_T 200 \
+      --lora_alpha 32 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/pretrain_7b.sh b/scripts/shell/pretrain_7b.sh
index 4b0455460b73bcf0a5075e57b1d09ac2ef29fc13..242145856523abca8fa8894dcd4e9fe68d16eea3 100644
--- a/scripts/shell/pretrain_7b.sh
+++ b/scripts/shell/pretrain_7b.sh
@@ -1,18 +1,27 @@
 #!/bin/bash
 
-python3 main.py \
-    --mode pretraining \
-    --optimizer galore8bit \
-    --model llama_7b \
-    --batch_size 512 \
-    --num_epochs 1 \
-    --num_training_tokens 13100000 \
-    --max_length 256 \
-    --shuffle false \
-    --dtype bf16 \
-    --lr_scheduler cosine \
-    --lr 1e-4 \
-    --lr_min 1e-5 \
-    --warm_up_fraction 0.1 \
-    --weight_decay 0 \
-    --test false
\ No newline at end of file
+optimizers=("galore8bit")
+for optimizer in $optimizers
+do
+  python3 main.py \
+      --mode pretraining \
+      --optimizer "$optimizer" \
+      --model llama_7b \
+      --batch_size 512 \
+      --num_epochs 1 \
+      --num_training_tokens 13100000 \
+      --max_length 256 \
+      --shuffle false \
+      --dtype bf16 \
+      --lr_scheduler cosine \
+      --lr 1e-4 \
+      --lr_min 1e-5 \
+      --warm_up_fraction 0.1 \
+      --weight_decay 0 \
+      --rank 128 \
+      --galore_alpha 0.25 \
+      --galore_T 200 \
+      --lora_alpha 32 \
+      --lora_dropout 0.1 \
+      --test false
+done
\ No newline at end of file