From 140033bea13ee55779c68ced361252434310b0cb Mon Sep 17 00:00:00 2001
From: Riko Uphoff <riko.uphoff@student.uni-halle.de>
Date: Sat, 29 Mar 2025 12:50:46 +0100
Subject: [PATCH] Fixed number of training steps for real; Adjusted default
 parameters;

---
 args.py                       |  2 +-
 main.py                       |  4 +++-
 scripts/shell/pretrain_60m.sh | 18 ++++++++++++++++++
 scripts/shell/pretrain_7b.sh  | 18 ++++++++++++++++++
 4 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100755 scripts/shell/pretrain_60m.sh
 create mode 100644 scripts/shell/pretrain_7b.sh

diff --git a/args.py b/args.py
index 8955eac..6f40cf4 100644
--- a/args.py
+++ b/args.py
@@ -6,7 +6,7 @@ parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8
 parser.add_argument("--lr_scheduler", type=str, choices=["constant", "linear", "cosine"], default="constant")
 parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
 parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
-parser.add_argument("--num_epochs", type=int, default=30, help="Number of epochs")
+parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs")
 parser.add_argument("--max_length", type=int, default=512, help="Max length of input tokens")
 parser.add_argument("--num_training_tokens", type=int, default=1e9, help="Number of training tokens")
 parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)")
diff --git a/main.py b/main.py
index b077348..3972801 100644
--- a/main.py
+++ b/main.py
@@ -66,6 +66,7 @@ if __name__ == "__main__":
 
     print(f"Running on: {device}")
     print(f"Using optimizer: {args.optimizer}")
+    print(f"Arguments: {args}")
 
     init_csv()
     model, tokenizer = get_model(args)
@@ -82,7 +83,8 @@ if __name__ == "__main__":
 
     optimizer, model = get_optimizer(args, model)
 
-    num_steps = ceil(args.num_epochs * len(dataloader))
+    num_batches = len(dataloader) if args.mode == "finetuning" else ceil(args.num_training_tokens / args.batch_size)
+    num_steps = args.num_epochs * num_batches
     scheduler = get_scheduler(
         optimizer, args.lr_scheduler, args.warm_up_fraction, num_steps, args.lr, args.lr_min
     )
diff --git a/scripts/shell/pretrain_60m.sh b/scripts/shell/pretrain_60m.sh
new file mode 100755
index 0000000..dc6a2e0
--- /dev/null
+++ b/scripts/shell/pretrain_60m.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+python3 main.py \
+    --mode pretraining \
+    --optimizer galore \
+    --model llama_60m \
+    --batch_size 8 \
+    --num_epochs 1 \
+    --num_training_tokens 10000 \
+    --max_length 256 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler cosine \
+    --lr 1e-4 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0.1 \
+    --weight_decay 0 \
+    --test false
\ No newline at end of file
diff --git a/scripts/shell/pretrain_7b.sh b/scripts/shell/pretrain_7b.sh
new file mode 100644
index 0000000..5be0e2a
--- /dev/null
+++ b/scripts/shell/pretrain_7b.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+python3 main.py \
+    --mode pretraining \
+    --optimizer galore8bit \
+    --model llama_7b \
+    --batch_size 131000 \
+    --num_epochs 1 \
+    --num_training_tokens 13100000 \
+    --max_length 256 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler cosine \
+    --lr 1e-4 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0.1 \
+    --weight_decay 0 \
+    --test false
\ No newline at end of file
--
GitLab
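
Note on the num_steps change in main.py: finetuning takes the epoch length from len(dataloader), while pretraining (which appears to stream data, per the --shuffle help text) derives it from the token budget instead. Below is a minimal sketch of that logic as a standalone helper; the function name estimate_num_steps and its free-standing signature are illustrative and not part of the patch.

from math import ceil

def estimate_num_steps(mode, num_epochs, num_training_tokens, batch_size, dataloader_len=None):
    """Mirror of the scheduler step count computed in main.py (illustrative only)."""
    if mode == "finetuning":
        # Finite dataset: one epoch is one pass over the dataloader.
        num_batches = dataloader_len
    else:
        # Streaming pretraining: derive the epoch length from the token budget.
        num_batches = ceil(num_training_tokens / batch_size)
    return num_epochs * num_batches

# With the values from scripts/shell/pretrain_60m.sh:
# ceil(10000 / 8) * 1 = 1250 scheduler steps.
print(estimate_num_steps("pretraining", num_epochs=1,
                         num_training_tokens=10000, batch_size=8))  # -> 1250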