diff --git a/args.py b/args.py
index 8955eaccf6f49a3137bc04ddb5edc032a74a0b1e..6f40cf45b72320e265523799b08b927e7a2fb112 100644
--- a/args.py
+++ b/args.py
@@ -6,7 +6,7 @@ parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8
 parser.add_argument("--lr_scheduler", type=str, choices=["constant", "linear", "cosine"], default="constant")
 parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
 parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
-parser.add_argument("--num_epochs", type=int, default=30, help="Number of epochs")
+parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs")
 parser.add_argument("--max_length", type=int, default=512, help="Max length of input tokens")
 parser.add_argument("--num_training_tokens", type=int, default=1e9, help="Number of training tokens")
 parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)")
diff --git a/main.py b/main.py
index b0773489cfbac6b38bb37ae628aa278c23f48b6f..3972801d6776c456ba62ec87fbe729113ab6e691 100644
--- a/main.py
+++ b/main.py
@@ -66,6 +66,7 @@ if __name__ == "__main__":
 
     print(f"Running on: {device}")
     print(f"Using optimizer: {args.optimizer}")
+    print(f"Arguments: {args}")
 
     init_csv()
     model, tokenizer = get_model(args)
@@ -82,7 +83,8 @@ if __name__ == "__main__":
 
     optimizer, model = get_optimizer(args, model)
 
-    num_steps = ceil(args.num_epochs * len(dataloader))
+    num_batches = len(dataloader) if args.mode == "finetuning" else ceil(args.num_training_tokens / args.batch_size)
+    num_steps = args.num_epochs * num_batches
     scheduler = get_scheduler(
         optimizer, args.lr_scheduler, args.warm_up_fraction, num_steps, args.lr, args.lr_min
     )
diff --git a/scripts/shell/pretrain_60m.sh b/scripts/shell/pretrain_60m.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dc6a2e00118c683792140bfc9477d7b7e71e1295
--- /dev/null
+++ b/scripts/shell/pretrain_60m.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+python3 main.py \
+    --mode pretraining \
+    --optimizer galore \
+    --model llama_60m \
+    --batch_size 8 \
+    --num_epochs 1 \
+    --num_training_tokens 10000 \
+    --max_length 256 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler cosine \
+    --lr 1e-4 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0.1 \
+    --weight_decay 0 \
+    --test false
\ No newline at end of file
diff --git a/scripts/shell/pretrain_7b.sh b/scripts/shell/pretrain_7b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5be0e2ad7368872be27f4dc746baf7cee543928e
--- /dev/null
+++ b/scripts/shell/pretrain_7b.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+python3 main.py \
+    --mode pretraining \
+    --optimizer galore8bit \
+    --model llama_7b \
+    --batch_size 131000 \
+    --num_epochs 1 \
+    --num_training_tokens 13100000 \
+    --max_length 256 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr_scheduler cosine \
+    --lr 1e-4 \
+    --lr_min 1e-5 \
+    --warm_up_fraction 0.1 \
+    --weight_decay 0 \
+    --test false
\ No newline at end of file
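
For reference, a minimal sketch (not part of the patch) of the scheduler step count that the main.py hunk introduces, assuming args carries the fields defined in args.py; the helper name total_scheduler_steps is hypothetical and only illustrates the arithmetic:

from math import ceil

def total_scheduler_steps(mode, num_epochs, num_training_tokens, batch_size, dataloader_len=None):
    # Finetuning iterates over the full dataloader each epoch; pretraining is
    # bounded by the token budget instead (the data is presumably streamed,
    # so len(dataloader) may not be meaningful there).
    if mode == "finetuning":
        num_batches = dataloader_len
    else:
        num_batches = ceil(num_training_tokens / batch_size)
    return num_epochs * num_batches

# With the settings from scripts/shell/pretrain_60m.sh:
#   total_scheduler_steps("pretraining", 1, 10000, 8) == ceil(10000 / 8) == 1250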