diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/galore-replication (GitLab).iml b/.idea/galore-replication (GitLab).iml
new file mode 100644
index 0000000000000000000000000000000000000000..8e5446ac9594d6e198c2a2923123566d13b94bf9
--- /dev/null
+++ b/.idea/galore-replication (GitLab).iml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ad209b915587c6302511b683f56709953b0a8d6e
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (galore-replication (GitLab))" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000000000000000000000000000000000000..0032debb3554a66440b67864b6ce0b6cbf10b66f
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/galore-replication (GitLab).iml" filepath="$PROJECT_DIR$/.idea/galore-replication (GitLab).iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..94a25f7f4cb416c083d265558da75d457237d671
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/args.py b/args.py
index 873e4ff9d8b6ef20a17b9e707d4395b42442b38e..8955eaccf6f49a3137bc04ddb5edc032a74a0b1e 100644
--- a/args.py
+++ b/args.py
@@ -3,6 +3,7 @@ import argparse
 parser = argparse.ArgumentParser(description="Run training")
 parser.add_argument("--mode", type=str, choices=["pretraining", "finetuning"], required=True, help="Training mode to use")
 parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8bit", "lora+galore8bit", "baseline"], required=True, help="Optimizer type to use")
+parser.add_argument("--lr_scheduler", type=str, choices=["constant", "linear", "cosine"], default="constant", help="Learning rate scheduler to use")
type=str, choices=["constant", "linear", "cosine"], default="constant") parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use") parser.add_argument("--batch_size", type=int, default=16, help="Batch size") parser.add_argument("--num_epochs", type=int, default=30, help="Number of epochs") @@ -11,8 +12,10 @@ parser.add_argument("--num_training_tokens", type=int, default=1e9, help="Number parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)") parser.add_argument("--dtype", type=str, choices=["bf16", "fp16"], default="fp16", help="Data type to use") # TODO for now just bf16 working parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer") +parser.add_argument("--lr_min", type=float, default=0, help="Minimum learning rate for annealing") +parser.add_argument("--warm_up_fraction", type=float, default=0, help="Fraction of training steps to use maximum learning rate as a warm-up") parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay for optimizer") -parser.add_argument("--tmax", type=int, default=30, help="Tmax for scheduler") +# parser.add_argument("--tmax", type=int, default=30, help="Tmax for scheduler") parser.add_argument("--lora_config", type=str, default="config/lora_config.json", help="Path to LoRa config file") parser.add_argument("--galore_config", type=str, default="config/galore_config.json", help="Path to GaLore config file") parser.add_argument("--test", type=str, choices=["true", "false"], default="false", help="Test mode") diff --git a/load_lr_scheduler.py b/load_lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..c0a4486858aeef17eac70cdc5d77210ac025e749 --- /dev/null +++ b/load_lr_scheduler.py @@ -0,0 +1,28 @@ +from torch.optim.lr_scheduler import ( + CosineAnnealingLR, LinearLR, ConstantLR, SequentialLR +) + + +def get_scheduler( + optimizer, + scheduler_type: str, + warm_up_fraction: float, + num_steps: int, + max_lr: float, + min_lr: float = 0.0, +): + warm_up_steps = int(warm_up_fraction * num_steps) + annealing_steps = num_steps - warm_up_steps + + warm_up_scheduler = ConstantLR(optimizer, 1.0, warm_up_steps) + + if scheduler_type == "constant": + annealing_scheduler = ConstantLR(optimizer, max_lr, annealing_steps) + elif scheduler_type == "linear": + annealing_scheduler = LinearLR(optimizer, max_lr, min_lr, annealing_steps) + elif scheduler_type == "cosine": + annealing_scheduler = CosineAnnealingLR(optimizer, annealing_steps, min_lr) + else: + raise ValueError(f"Scheduler option '{scheduler_type}' unknown. 
+
+    return SequentialLR(optimizer, [warm_up_scheduler, annealing_scheduler], [warm_up_steps],)
diff --git a/load_models.py b/load_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ae519380916818030caa017d450f0d7e456d725
--- /dev/null
+++ b/load_models.py
@@ -0,0 +1,44 @@
+from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
+import torch
+
+
+def get_model(args):
+    """ Creates model for Pretraining or Fine-Tuning """
+    if args.mode == "pretraining":
+        model_config = AutoConfig.from_pretrained(f"config/{args.model}.json")
+        if args.dtype == "bf16":
+            model = AutoModelForCausalLM.from_config(model_config, torch_dtype=torch.bfloat16)
+        else:
+            model = AutoModelForCausalLM.from_config(model_config)
+
+        # in the galore project they say:
+        # "it doesn't matter which tokenizer we use, because we train from scratch
+        # T5 tokenizer was trained on C4 and we are also training on C4, so it's a good choice"
+        tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+        if model.config.pad_token_id is None or model.config.pad_token_id == -1:
+            model.config.pad_token_id = tokenizer.pad_token_id
+        model.generation_config.pad_token_id = model.config.pad_token_id
+
+    elif args.mode == "finetuning":
+        if args.model == "roberta":
+            if args.dtype == "bf16":
+                model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2,
+                                                                           torch_dtype=torch.bfloat16)
+            else:
+                model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
+            tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+        elif args.model == "gpt2":
+            model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
+            tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
+
+            tokenizer.pad_token = tokenizer.eos_token
+            model.config.pad_token_id = tokenizer.pad_token_id
+        else:
+            raise ValueError("Invalid model name. Choose 'roberta' or 'gpt2'")
+    else:
+        raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
+
+    return model, tokenizer
diff --git a/load_optimizers.py b/load_optimizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..cde4e1dba1787cbfe6d49c053eb83cc44ec98735
--- /dev/null
+++ b/load_optimizers.py
@@ -0,0 +1,60 @@
+from galore_torch import GaLoreAdamW, GaLoreAdamW8bit
+from peft import LoraConfig, get_peft_model
+from torch.optim import AdamW
+import json
+
+
+def load_lora_config(args):
+    """Loads LoRa configuration from file"""
+    with open(args.lora_config, "r") as f:
+        lora_params = json.load(f)
+
+    target_modules = lora_params["target_modules_finetuning"] if args.mode == "finetuning" else lora_params[
+        "target_modules_pretraining"]
+
+    return LoraConfig(
+        r=lora_params["r"],
+        lora_alpha=lora_params["lora_alpha"],
+        lora_dropout=lora_params["lora_dropout"],
+        target_modules=target_modules
+    )
+
+
+def load_galore_config(args):
+    """Loads GaLore configuration from file"""
+    with open(args.galore_config, "r") as f:
+        return json.load(f)
+
+
+def get_optimizer(args, model):
+    """Creates optimizer (GaLore, LoRa, or baseline AdamW)"""
+    default_lr = 1.0  # Will be scheduled by LRScheduler
+    # TODO What to do with weight_decay for AdamW?
+
+    if args.optimizer == "baseline":
+        return AdamW(model.parameters(), lr=default_lr, weight_decay=args.weight_decay), model
+    elif args.optimizer in ["galore", "galore8bit"]:
+        galore_config = load_galore_config(args)
+        trainable_params = [p for p in model.parameters() if p.requires_grad and p.dim() > 1]
+
+        param_groups = [
+            {"params": trainable_params, **galore_config}
+        ]
+        optimizer_class = GaLoreAdamW if args.optimizer == "galore" else GaLoreAdamW8bit
+        return optimizer_class(param_groups, lr=default_lr, weight_decay=args.weight_decay), model
+    elif args.optimizer in ["lora", "lora+galore8bit"]:
+        lora_config = load_lora_config(args)
+        model = get_peft_model(model, lora_config)
+        model.print_trainable_parameters()
+
+        if args.optimizer == "lora":
+            return AdamW(model.parameters(), lr=args.lr), model
+        else:
+            galore_config = load_galore_config(args)
+            trainable_params = [p for p in model.parameters() if p.requires_grad and p.dim() > 1]
+            param_groups = [
+                {"params": trainable_params, **galore_config}
+            ]
+            return GaLoreAdamW8bit(param_groups, lr=default_lr, weight_decay=args.weight_decay), model
+    else:
+        raise ValueError(f"Unknown optimizer: {args.optimizer}")
\ No newline at end of file
diff --git a/main.py b/main.py
index edc326f04addfdbd8404df6575d2c8cb294194cd..7342f3dc53d25b3cde0bdd4f9f7b51648cf597e6 100644
--- a/main.py
+++ b/main.py
@@ -1,104 +1,16 @@
 from load_data import load_data
-from galore_torch import GaLoreAdamW, GaLoreAdamW8bit
+from load_models import get_model
+from load_optimizers import get_optimizer
+from load_lr_scheduler import get_scheduler
+
+from math import ceil
 from logger import init_csv, log_to_csv
 from accelerate import Accelerator
-from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 import torch
 from torch.utils.data import DataLoader
 from args import args
-from peft import LoraConfig, get_peft_model
-from torch.optim import AdamW
-import json
 import datetime
 
-def get_model(args):
-    """ Creates model for Pretraining or Fine-Tuning """
-    if args.mode == "pretraining":
-        model_config = AutoConfig.from_pretrained(f"config/{args.model}.json")
-        if args.dtype == "bf16":
-            model = AutoModelForCausalLM.from_config(model_config, torch_dtype=torch.bfloat16)
-        else:
-            model = AutoModelForCausalLM.from_config(model_config)
-
-        # in the galore project they say:
-        # "it doesn't matter which tokenizer we use, because we train from scratch
-        # T5 tokenizer was trained on C4 and we are also training on C4, so it's a good choice"
-        tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-        if tokenizer.pad_token_id is None:
-            tokenizer.pad_token_id = tokenizer.eos_token_id
-        if model.config.pad_token_id is None or model.config.pad_token_id == -1:
-            model.config.pad_token_id = tokenizer.pad_token_id
-        model.generation_config.pad_token_id = model.config.pad_token_id
-
-    elif args.mode == "finetuning":
-        if args.model == "roberta":
-            if args.dtype == "bf16":
-                model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2, torch_dtype=torch.bfloat16)
-            else:
-                model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
-            tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-        elif args.model == "gpt2":
-            model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
-            tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side = "left")
-
-            tokenizer.pad_token = tokenizer.eos_token
-            model.config.pad_token_id = tokenizer.pad_token_id
-        else:
-            raise ValueError("Invalid model name. Choose 'roberta' or 'gpt2'")
-    else:
-        raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
-
-    return model, tokenizer
-
-def load_lora_config(args):
-    """Loads LoRa configuration from file"""
-    with open(args.lora_config, "r") as f:
-        lora_params = json.load(f)
-
-    target_modules = lora_params["target_modules_finetuning"] if args.mode == "finetuning" else lora_params["target_modules_pretraining"]
-
-    return LoraConfig(
-        r=lora_params["r"],
-        lora_alpha=lora_params["lora_alpha"],
-        lora_dropout=lora_params["lora_dropout"],
-        target_modules=target_modules
-    )
-
-def load_galore_config(args):
-    """Loads GaLore configuration from file"""
-    with open(args.galore_config, "r") as f:
-        return json.load(f)
-
-def get_optimizer(args, model):
-    """Creates optimizer (GaLore, LoRa, or baseline AdamW)"""
-    if args.optimizer == "baseline":
-        return AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay), model
-    elif args.optimizer in ["galore", "galore8bit"]:
-        galore_config = load_galore_config(args)
-        trainable_params = [p for p in model.parameters() if p.requires_grad and p.dim() > 1]
-
-        param_groups = [
-            {"params": trainable_params, **galore_config}
-        ]
-        optimizer_class = GaLoreAdamW if args.optimizer == "galore" else GaLoreAdamW8bit
-        return optimizer_class(param_groups, lr=args.lr, weight_decay=args.weight_decay), model
-    elif args.optimizer in ["lora", "lora+galore8bit"]:
-        lora_config = load_lora_config(args)
-        model = get_peft_model(model, lora_config)
-        model.print_trainable_parameters()
-
-        if args.optimizer == "lora":
-            return AdamW(model.parameters(), lr=args.lr), model
-        else:
-            galore_config = load_galore_config()
-            trainable_params = [p for p in model.parameters() if p.requires_grad and p.dim() > 1]
-            param_groups = [
-                {"params": trainable_params, **galore_config}
-            ]
-            return GaLoreAdamW8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay), model
-    else:
-        raise ValueError(f"Unknown optimizer: {args.optimizer}")
 
 def train(device, accelerator, scheduler, model, optimizer, dataloader, num_epochs):
     """ training model """
@@ -142,8 +54,9 @@ def train(device, accelerator, scheduler, model, optimizer, dataloader, num_epoc
 
     return model
 
+
 if __name__ == "__main__":
-    if (args.test == "true"):
+    if args.test == "true":
         print("Test mode")
         accelerator = Accelerator()
     else:
@@ -160,7 +73,11 @@ if __name__ == "__main__":
 
     dataset = load_data(args, tokenizer)
    optimizer, model = get_optimizer(args, model)
-    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.tmax)
+
+    num_steps = ceil(args.num_epochs * len(dataset) / args.batch_size)
+    scheduler = get_scheduler(
+        optimizer, args.lr_scheduler, args.warm_up_fraction, num_steps, args.lr, args.lr_min
+    )
 
     shuffle = True if args.shuffle == "true" else False
     if args.mode == "pretraining":
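
A quick way to exercise the new scheduler factory outside a full training run is sketched below. This is not part of the patch: it assumes it is run from the repository root (so load_lr_scheduler.py is importable) and uses a throwaway one-parameter AdamW optimizer; the schedulers returned by get_scheduler rescale whatever base learning rate that optimizer was created with.

# Illustrative sketch (not part of the diff): inspect the learning-rate trajectory
# produced by get_scheduler before wiring it into main.py. Assumes PyTorch is
# installed and the script is run from the repository root.
import torch
from load_lr_scheduler import get_scheduler

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=4e-4)  # base lr that the schedulers rescale

num_steps = 100
scheduler = get_scheduler(
    optimizer,
    scheduler_type="cosine",   # or "constant" / "linear"
    warm_up_fraction=0.1,      # first 10% of steps before annealing starts
    num_steps=num_steps,
    max_lr=4e-4,
    min_lr=1e-5,
)

for step in range(num_steps):
    optimizer.step()           # normally preceded by forward/backward
    scheduler.step()
    if step % 10 == 0:
        print(step, optimizer.param_groups[0]["lr"])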