From 140033bea13ee55779c68ced361252434310b0cb Mon Sep 17 00:00:00 2001
From: Riko Uphoff <riko.uphoff@student.uni-halle.de>
Date: Sat, 29 Mar 2025 12:50:46 +0100
Subject: [PATCH] Correctly compute the number of training steps; adjust
 default parameters

---
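Notes (below the "---" marker, so not part of the commit message):

In pretraining mode, where data is streamed (cf. the --shuffle help text in
args.py), len(dataloader) is not a reliable step count, so the scheduler
budget is now derived from --num_training_tokens and --batch_size instead;
finetuning keeps using len(dataloader). A minimal, self-contained sketch of
the new computation (training_steps and cfg are illustrative stand-ins, not
names from main.py):

    from math import ceil
    from types import SimpleNamespace

    def training_steps(args, dataloader=None):
        if args.mode == "finetuning":
            # finite, map-style dataset: len(dataloader) is well defined
            num_batches = len(dataloader)
        else:
            # streaming pretraining: derive the batch count from the token budget
            num_batches = ceil(args.num_training_tokens / args.batch_size)
        return args.num_epochs * num_batches

    # e.g. the settings used by pretrain_60m.sh below:
    cfg = SimpleNamespace(mode="pretraining", num_training_tokens=10_000,
                          batch_size=8, num_epochs=1)
    print(training_steps(cfg))  # ceil(10000 / 8) = 1250 scheduler steps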
 args.py                       |  2 +-
 main.py                       |  4 +++-
 scripts/shell/pretrain_60m.sh | 18 ++++++++++++++++++
 scripts/shell/pretrain_7b.sh  | 18 ++++++++++++++++++
 4 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100755 scripts/shell/pretrain_60m.sh
 create mode 100644 scripts/shell/pretrain_7b.sh
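
By the same formula, pretrain_7b.sh below schedules
ceil(13100000 / 131000) = 100 steps for its single epoch.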

diff --git a/args.py b/args.py
index 8955eac..6f40cf4 100644
--- a/args.py
+++ b/args.py
@@ -6,7 +6,7 @@ parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8
 parser.add_argument("--lr_scheduler", type=str, choices=["constant", "linear", "cosine"], default="constant")
 parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
 parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
-parser.add_argument("--num_epochs", type=int, default=30, help="Number of epochs")
+parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs")
 parser.add_argument("--max_length", type=int, default=512, help="Max length of input tokens")
 parser.add_argument("--num_training_tokens", type=int, default=1e9, help="Number of training tokens")
 parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)")
diff --git a/main.py b/main.py
index b077348..3972801 100644
--- a/main.py
+++ b/main.py
@@ -66,6 +66,7 @@ if __name__ == "__main__":
 
     print(f"Running on: {device}")
     print(f"Using optimizer: {args.optimizer}")
+    print(f"Arguments: {args}")
     init_csv()
 
     model, tokenizer = get_model(args)
@@ -82,7 +83,8 @@ if __name__ == "__main__":
 
     optimizer, model = get_optimizer(args, model)
 
-    num_steps = ceil(args.num_epochs * len(dataloader))
+    num_batches = len(dataloader) if args.mode == "finetuning" else ceil(args.num_training_tokens / args.batch_size)
+    num_steps = args.num_epochs * num_batches
     scheduler = get_scheduler(
         optimizer, args.lr_scheduler, args.warm_up_fraction, num_steps, args.lr, args.lr_min
     )
diff --git a/scripts/shell/pretrain_60m.sh b/scripts/shell/pretrain_60m.sh
new file mode 100755
index 0000000..dc6a2e0
--- /dev/null
+++ b/scripts/shell/pretrain_60m.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+python3 main.py \
+    --mode pretraining \
+    --optimizer galore \
+    --model llama_60m \
+    --batch_size 8 \
+    --num_epochs 1 \
+    --num_training_tokens 10000 \
+    --max_length 256 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler cosine \
+    --lr 1e-4 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0.1 \
+    --weight_decay 0 \
+    --test false
\ No newline at end of file
diff --git a/scripts/shell/pretrain_7b.sh b/scripts/shell/pretrain_7b.sh
new file mode 100644
index 0000000..5be0e2a
--- /dev/null
+++ b/scripts/shell/pretrain_7b.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+python3 main.py \
+    --mode pretraining \
+    --optimizer galore8bit \
+    --model llama_7b \
+    --batch_size 131000 \
+    --num_epochs 1 \
+    --num_training_tokens 13100000 \
+    --max_length 256 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler cosine \
+    --lr 1e-4 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0.1 \
+    --weight_decay 0 \
+    --test false
\ No newline at end of file
-- 
GitLab