diff --git a/scripts/shell/pretrain_60m.sh b/scripts/shell/pretrain_60m.sh
index dc6a2e00118c683792140bfc9477d7b7e71e1295..77432d904b31a0aeb06858d495e56f0faea216a3 100755
--- a/scripts/shell/pretrain_60m.sh
+++ b/scripts/shell/pretrain_60m.sh
@@ -4,9 +4,9 @@ python3 main.py \
     --mode pretraining \
     --optimizer galore \
     --model llama_60m \
-    --batch_size 8 \
+    --batch_size 512 \
     --num_epochs 1 \
-    --num_training_tokens 10000 \
+    --num_training_tokens 1310000000 \
     --max_length 256 \
     --shuffle false \
     --dtype bf16 \
diff --git a/scripts/shell/pretrain_7b.sh b/scripts/shell/pretrain_7b.sh
index 5be0e2ad7368872be27f4dc746baf7cee543928e..4b0455460b73bcf0a5075e57b1d09ac2ef29fc13 100644
--- a/scripts/shell/pretrain_7b.sh
+++ b/scripts/shell/pretrain_7b.sh
@@ -4,7 +4,7 @@ python3 main.py \
    --mode pretraining \
    --optimizer galore8bit \
    --model llama_7b \
-   --batch_size 131000 \
+   --batch_size 512 \
    --num_epochs 1 \
    --num_training_tokens 13100000 \
    --max_length 256 \
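
Sanity check (not part of the diff): the new values are mutually consistent if main.py consumes batch_size * max_length tokens per optimizer step — an assumption about main.py's token accounting, not verified here:

    # Assumed: one step consumes batch_size * max_length tokens.
    echo $((512 * 256))                 # 131072 tokens per step
    echo $((1310000000 / (512 * 256))) # ~9994 steps for llama_60m
    echo $((13100000 / (512 * 256)))   # ~99 steps for llama_7b

Under that assumption, the old 60M values (batch_size 8, 10000 tokens) were smoke-test scale, and the 7B script's old batch_size of 131000 looks like a tokens-per-step figure pasted into the wrong flag; 512 brings it in line with the 60M script.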