From 5b2b84fa273db654fb2b78dde5503cc094b0f050 Mon Sep 17 00:00:00 2001
From: Riko Uphoff <riko.uphoff@student.uni-halle.de>
Date: Sun, 30 Mar 2025 15:06:52 +0200
Subject: [PATCH] Added Galore and Lora parameters to args; Added Datasets to
 args; Added training scripts;

---
 args.py                                       | 24 +++++++----
 load_data.py                                  | 17 +++++++-
 load_optimizers.py                            | 26 +++++++++---
 scripts/shell/finetune_roberta_galore_cola.sh | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_mnli.sh | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_mrpc.sh | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_qnli.sh | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_qqp.sh  | 27 ++++++++++++
 scripts/shell/finetune_roberta_galore_rte.sh  | 27 ++++++++++++
 .../shell/finetune_roberta_galore_sst-2.sh    | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_cola.sh   | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_mnli.sh   | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_mrpc.sh   | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_qnli.sh   | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_qqp.sh    | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_rte.sh    | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_sst-2.sh  | 27 ++++++++++++
 scripts/shell/finetune_roberta_lora_sts-b.sh  | 27 ++++++++++++
 scripts/shell/pretrain_60m.sh                 | 41 +++++++++++--------
 scripts/shell/pretrain_7b.sh                  | 41 +++++++++++--------
 20 files changed, 509 insertions(+), 45 deletions(-)
 create mode 100644 scripts/shell/finetune_roberta_galore_cola.sh
 create mode 100755 scripts/shell/finetune_roberta_galore_mnli.sh
 create mode 100644 scripts/shell/finetune_roberta_galore_mrpc.sh
 create mode 100644 scripts/shell/finetune_roberta_galore_qnli.sh
 create mode 100644 scripts/shell/finetune_roberta_galore_qqp.sh
 create mode 100644 scripts/shell/finetune_roberta_galore_rte.sh
 create mode 100755 scripts/shell/finetune_roberta_galore_sst-2.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_cola.sh
 create mode 100755 scripts/shell/finetune_roberta_lora_mnli.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_mrpc.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_qnli.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_qqp.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_rte.sh
 create mode 100755 scripts/shell/finetune_roberta_lora_sst-2.sh
 create mode 100644 scripts/shell/finetune_roberta_lora_sts-b.sh

diff --git a/args.py b/args.py
index 6f40cf4..70393b6 100644
--- a/args.py
+++ b/args.py
@@ -2,21 +2,31 @@ import argparse
 
 parser = argparse.ArgumentParser(description="Run training")
 parser.add_argument("--mode", type=str, choices=["pretraining", "finetuning"], required=True, help="Training mode to use")
+parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
+parser.add_argument("--dataset", type=str, choices=["c4", "glue_mnli", "glue_sst-2", "glue_mrpc", "glue_cola", "glue_qnli", "glue_qqp", "glue_rte", "glue_sts-b"], required=True, help="Dataset to use")
+
 parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8bit", "lora+galore8bit", "baseline"], required=True, help="Optimizer type to use")
+parser.add_argument("--weight_decay", type=float, default=0, help="Weight decay for optimizer")
+parser.add_argument("--rank", type=int, default=128, help="Rank of the sub-space for LoRA and GaLore")
+parser.add_argument("--galore_alpha", type=float, default=1.0, help="Scaling factor for optimizer updates")
+parser.add_argument("--galore_T", type=int, default=200, help="Sub-space change frequency")
+parser.add_argument("--lora_alpha", type=float, default=1.0, help="Scaling factor for optimizer updates")
+parser.add_argument("--lora_dropout", type=float, default=0.1, help="Dropout rate for LoRA")
+
 parser.add_argument("--lr_scheduler", type=str, choices=["constant", "linear", "cosine"], default="constant")
-parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
+parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer")
+parser.add_argument("--lr_min", type=float, default=0, help="Minimum learning rate for annealing")
+parser.add_argument("--warm_up_fraction", type=float, default=0, help="Fraction of training steps to use maximum learning rate as a warm-up")
+
 parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
 parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs")
 parser.add_argument("--max_length", type=int, default=512, help="Max length of input tokens")
 parser.add_argument("--num_training_tokens", type=int, default=1e9, help="Number of training tokens")
 parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)")
 parser.add_argument("--dtype", type=str, choices=["bf16", "fp16"], default="fp16", help="Data type to use") # TODO for now just bf16 working
-parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer")
-parser.add_argument("--lr_min", type=float, default=0, help="Minimum learning rate for annealing")
-parser.add_argument("--warm_up_fraction", type=float, default=0, help="Fraction of training steps to use maximum learning rate as a warm-up")
-parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay for optimizer")
 # parser.add_argument("--tmax", type=int, default=30, help="Tmax for scheduler")
-parser.add_argument("--lora_config", type=str, default="config/lora_config.json", help="Path to LoRa config file")
-parser.add_argument("--galore_config", type=str, default="config/galore_config.json", help="Path to GaLore config file")
+parser.add_argument("--lora_config", type=str, default=None, help="Path to LoRa config file")
+parser.add_argument("--galore_config", type=str, default=None, help="Path to GaLore config file")
+
 parser.add_argument("--test", type=str, choices=["true", "false"], default="false", help="Test mode")
 args = parser.parse_args()
diff --git a/load_data.py b/load_data.py
index 028a2a1..1a7ed48 100644
--- a/load_data.py
+++ b/load_data.py
@@ -1,6 +1,7 @@
 import torch
 from datasets import load_dataset
 
+
 def load_data(args, tokenizer):
     if args.mode == "pretraining":
         return load_data_pretrain(args, tokenizer)
@@ -9,6 +10,7 @@ def load_data(args, tokenizer):
     else:
         raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
 
+
 def load_data_pretrain(args, tokenizer):
     dataset = load_dataset("allenai/c4", "realnewslike", streaming=True, split="train")
     dataset = dataset.take(args.num_training_tokens)
@@ -25,8 +27,21 @@ def load_data_pretrain(args, tokenizer):
 
     return dataset
 
+
 def load_data_finetune(args, tokenizer):
-    dataset = load_dataset("glue", "sst2")
+    arg_map = {
+        "glue_mnli": ("glue", "mnli"),
+        "glue_sst-2": ("glue", "sst2"),
+        "glue_mrpc": ("glue", "mrpc"),
+        "glue_cola": ("glue", "cola"),
+        "glue_qnli": ("glue", "qnli"),
+        "glue_qqp": ("glue", "qqp"),
+        "glue_rte": ("glue", "rte"),
+        "glue_sts-b": ("glue", "stsb"),
+    }
+    if args.dataset not in arg_map:
+        raise ValueError(f"Data set '{args.dataset}' not supported for mode 'finetuning'!")
+    dataset = load_dataset(*arg_map[args.dataset])
 
     def tokenize_function_finetune(batch):
         return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=args.max_length)
diff --git a/load_optimizers.py b/load_optimizers.py
index cde4e1d..08a91cf 100644
--- a/load_optimizers.py
+++ b/load_optimizers.py
@@ -6,8 +6,17 @@ import json
 
 def load_lora_config(args):
     """Loads LoRa configuration from file"""
-    with open(args.lora_config, "r") as f:
-        lora_params = json.load(f)
+    if args.lora_config is not None:
+        with open(args.lora_config, "r") as f:
+            lora_params = json.load(f)
+    else:
+        lora_params = {
+            "r": args.rank,
+            "lora_alpha": args.lora_alpha,
+            "lora_dropout": args.lora_dropout,
+            "target_modules_finetuning": ["query", "value"],
+            "target_modules_pretraining": ["q_proj", "v_proj"]
+        }
 
     target_modules = lora_params["target_modules_finetuning"] if args.mode == "finetuning" else lora_params[
         "target_modules_pretraining"]
@@ -22,14 +31,21 @@ def load_lora_config(args):
 
 def load_galore_config(args):
     """Loads GaLore configuration from file"""
-    with open(args.galore_config, "r") as f:
-        return json.load(f)
+    if args.galore_config is not None:
+        with open(args.galore_config, "r") as f:
+            return json.load(f)
+    else:
+        return {
+            "rank": args.rank,
+            "update_proj_gap": args.galore_T,
+            "scale": args.galore_alpha,
+            "proj_type": "std"
+        }
 
 
 def get_optimizer(args, model):
     """Creates optimizer (GaLore, LoRa, or baseline AdamW)"""
     default_lr = 1.0 # Will be scheduled by LRScheduler
-    # TODO What to do with weight_decay for AdamW?
 
     if args.optimizer == "baseline":
         return AdamW(model.parameters(), lr=default_lr, weight_decay=args.weight_decay), model
diff --git a/scripts/shell/finetune_roberta_galore_cola.sh b/scripts/shell/finetune_roberta_galore_cola.sh
new file mode 100644
index 0000000..c8a08a4
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_cola.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_cola \
+    --batch_size 32 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler constant \
+    --lr 1e-5 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_mnli.sh b/scripts/shell/finetune_roberta_galore_mnli.sh
new file mode 100755
index 0000000..08e9d17
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_mnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_mnli \
+    --batch_size 16 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler constant \
+    --lr 1e-5 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_mrpc.sh b/scripts/shell/finetune_roberta_galore_mrpc.sh
new file mode 100644
index 0000000..adc15ad
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_mrpc.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_mrpc \
+    --batch_size 16 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler constant \
+    --lr 2e-5 \
+    --lr_min 2e-5 \
+    --warm_up_fraction 0 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_qnli.sh b/scripts/shell/finetune_roberta_galore_qnli.sh
new file mode 100644
index 0000000..5353903
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_qnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_qnli \
+    --batch_size 16 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler constant \
+    --lr 1e-5 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_qqp.sh b/scripts/shell/finetune_roberta_galore_qqp.sh
new file mode 100644
index 0000000..1380838
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_qqp.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_qqp \
+    --batch_size 16 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler constant \
+    --lr 2e-5 \
+    --lr_min 2e-5 \
+    --warm_up_fraction 0 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_rte.sh b/scripts/shell/finetune_roberta_galore_rte.sh
new file mode 100644
index 0000000..740a035
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_rte.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_rte \
+    --batch_size 16 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler constant \
+    --lr 2e-5 \
+    --lr_min 2e-5 \
+    --warm_up_fraction 0 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_galore_sst-2.sh b/scripts/shell/finetune_roberta_galore_sst-2.sh
new file mode 100755
index 0000000..c9b5b45
--- /dev/null
+++ b/scripts/shell/finetune_roberta_galore_sst-2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_sst-2 \
+    --batch_size 16 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler constant \
+    --lr 2e-5 \
+    --lr_min 2e-5 \
+    --warm_up_fraction 0 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_cola.sh b/scripts/shell/finetune_roberta_lora_cola.sh
new file mode 100644
index 0000000..322389a
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_cola.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_cola \
+    --batch_size 32 \
+    --num_epochs 80 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler linear \
+    --lr 4e-4 \
+    --lr_min 0.01 \
+    --warm_up_fraction 0.06 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_mnli.sh b/scripts/shell/finetune_roberta_lora_mnli.sh
new file mode 100755
index 0000000..cf2210c
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_mnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_mnli \
+    --batch_size 16 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler linear \
+    --lr 5e-4 \
+    --lr_min 0.01 \
+    --warm_up_fraction 0.06 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_mrpc.sh b/scripts/shell/finetune_roberta_lora_mrpc.sh
new file mode 100644
index 0000000..fda767a
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_mrpc.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_mrpc \
+    --batch_size 16 \
+    --num_epochs 30 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler linear \
+    --lr 4e-4 \
+    --lr_min 0.01 \
+    --warm_up_fraction 0.06 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_qnli.sh b/scripts/shell/finetune_roberta_lora_qnli.sh
new file mode 100644
index 0000000..3380926
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_qnli.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_qnli \
+    --batch_size 32 \
+    --num_epochs 25 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler linear \
+    --lr 4e-4 \
+    --lr_min 0.01 \
+    --warm_up_fraction 0.06 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_qqp.sh b/scripts/shell/finetune_roberta_lora_qqp.sh
new file mode 100644
index 0000000..d95a220
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_qqp.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_qqp \
+    --batch_size 16 \
+    --num_epochs 25 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler linear \
+    --lr 5e-4 \
+    --lr_min 0.01 \
+    --warm_up_fraction 0.06 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_rte.sh b/scripts/shell/finetune_roberta_lora_rte.sh
new file mode 100644
index 0000000..3e9e35e
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_rte.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_rte \
+    --batch_size 32 \
+    --num_epochs 80 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler linear \
+    --lr 5e-4 \
+    --lr_min 0.01 \
+    --warm_up_fraction 0.06 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_sst-2.sh b/scripts/shell/finetune_roberta_lora_sst-2.sh
new file mode 100755
index 0000000..024c506
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_sst-2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_sst-2 \
+    --batch_size 16 \
+    --num_epochs 60 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler linear \
+    --lr 5e-4 \
+    --lr_min 0.01 \
+    --warm_up_fraction 0.06 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/finetune_roberta_lora_sts-b.sh b/scripts/shell/finetune_roberta_lora_sts-b.sh
new file mode 100644
index 0000000..704549d
--- /dev/null
+++ b/scripts/shell/finetune_roberta_lora_sts-b.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode finetuning \
+    --optimizer "$optimizer" \
+    --model roberta \
+    --dataset glue_sts-b \
+    --batch_size 16 \
+    --num_epochs 40 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler linear \
+    --lr 4e-4 \
+    --lr_min 0.01 \
+    --warm_up_fraction 0.06 \
+    --weight_decay 0 \
+    --rank 8 \
+    --galore_alpha 2 \
+    --galore_T 200 \
+    --lora_alpha 8 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/pretrain_60m.sh b/scripts/shell/pretrain_60m.sh
index 77432d9..f7ae642 100755
--- a/scripts/shell/pretrain_60m.sh
+++ b/scripts/shell/pretrain_60m.sh
@@ -1,18 +1,27 @@
 #!/bin/bash
 
-python3 main.py \
-  --mode pretraining \
-  --optimizer galore \
-  --model llama_60m \
-  --batch_size 512 \
-  --num_epochs 1 \
-  --num_training_tokens 1310000000 \
-  --max_length 256 \
-  --shuffle false \
-  --dtype bf16 \
-  --lr_scheduler cosine \
-  --lr 1e-4 \
-  --lr_min 1e-5 \
-  --warm_up_fraction 0.1 \
-  --weight_decay 0 \
-  --test false
\ No newline at end of file
+optimizers=("galore" "galore8bit" "lora" "lora+galore8bit" "baseline")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode pretraining \
+    --optimizer "$optimizer" \
+    --model llama_60m \
+    --batch_size 512 \
+    --num_epochs 1 \
+    --num_training_tokens 1310000000 \
+    --max_length 256 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler cosine \
+    --lr 1e-4 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0.1 \
+    --weight_decay 0 \
+    --rank 128 \
+    --galore_alpha 0.25 \
+    --galore_T 200 \
+    --lora_alpha 32 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
diff --git a/scripts/shell/pretrain_7b.sh b/scripts/shell/pretrain_7b.sh
index 4b04554..2421458 100644
--- a/scripts/shell/pretrain_7b.sh
+++ b/scripts/shell/pretrain_7b.sh
@@ -1,18 +1,27 @@
 #!/bin/bash
 
-python3 main.py \
-  --mode pretraining \
-  --optimizer galore8bit \
-  --model llama_7b \
-  --batch_size 512 \
-  --num_epochs 1 \
-  --num_training_tokens 13100000 \
-  --max_length 256 \
-  --shuffle false \
-  --dtype bf16 \
-  --lr_scheduler cosine \
-  --lr 1e-4 \
-  --lr_min 1e-5 \
-  --warm_up_fraction 0.1 \
-  --weight_decay 0 \
-  --test false
\ No newline at end of file
+optimizers=("galore8bit")
+for optimizer in "${optimizers[@]}"
+do
+  python3 main.py \
+    --mode pretraining \
+    --optimizer "$optimizer" \
+    --model llama_7b \
+    --batch_size 512 \
+    --num_epochs 1 \
+    --num_training_tokens 13100000 \
+    --max_length 256 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler cosine \
+    --lr 1e-4 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0.1 \
+    --weight_decay 0 \
+    --rank 128 \
+    --galore_alpha 0.25 \
+    --galore_T 200 \
+    --lora_alpha 32 \
+    --lora_dropout 0.1 \
+    --test false
+done
\ No newline at end of file
--
GitLab