From 6ca32c333bbf5ff52927431b360ced8dd361a97c Mon Sep 17 00:00:00 2001
From: Riko Uphoff <riko.uphoff@student.uni-halle.de>
Date: Fri, 28 Mar 2025 15:35:11 +0100
Subject: [PATCH] Added source code and extended Dockerfile

---
 .gitignore                        |   5 +
 Dockerfile                        |  12 ++
 args.py                           |  19 ++++
 config/README.md                  |   4 +
 config/galore_config.json         |   6 +
 config/llama_1b.json              |  20 ++++
 config/llama_60m.json             |  20 ++++
 config/llama_7b.json              |  20 ++++
 config/lora_config.json           |   7 ++
 load_data.py                      |  37 +++++++
 logger.py                         |  38 +++++++
 main.py                           | 178 ++++++++++++++++++++++++++++++
 requirements.txt                  |   8 ++
 run_glue_benchmark.py             |  60 ++++++++++
 scripts/shell/pretrain.sh         |  16 +++
 scripts/shell/test.sh             |  16 +++
 scripts/slurm/run_pretrain.sh     |  19 ++++
 scripts/windows/finetune_lora.bat |  15 +++
 scripts/windows/test.bat          |  15 +++
 19 files changed, 515 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 args.py
 create mode 100644 config/README.md
 create mode 100644 config/galore_config.json
 create mode 100644 config/llama_1b.json
 create mode 100644 config/llama_60m.json
 create mode 100644 config/llama_7b.json
 create mode 100644 config/lora_config.json
 create mode 100644 load_data.py
 create mode 100644 logger.py
 create mode 100644 main.py
 create mode 100644 requirements.txt
 create mode 100644 run_glue_benchmark.py
 create mode 100644 scripts/shell/pretrain.sh
 create mode 100644 scripts/shell/test.sh
 create mode 100644 scripts/slurm/run_pretrain.sh
 create mode 100644 scripts/windows/finetune_lora.bat
 create mode 100644 scripts/windows/test.bat

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e7d32d6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__
+test.py
+output.txt
+output.csv
+models/
diff --git a/Dockerfile b/Dockerfile
index d119cb3..355ff3e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1 +1,13 @@
 FROM nvidia/cuda:12.3.0-devel-ubuntu20.04
+
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --upgrade pip
+
+COPY requirements.txt /workspace/requirements.txt
+RUN pip3 install -r /workspace/requirements.txt
+
+WORKDIR /workspace
diff --git a/args.py b/args.py
new file mode 100644
index 0000000..873e4ff
--- /dev/null
+++ b/args.py
@@ -0,0 +1,19 @@
+import argparse
+
+parser = argparse.ArgumentParser(description="Run training")
+parser.add_argument("--mode", type=str, choices=["pretraining", "finetuning"], required=True, help="Training mode to use")
+parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8bit", "lora+galore8bit", "baseline"], required=True, help="Optimizer type to use")
+parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
+parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
+parser.add_argument("--num_epochs", type=int, default=30, help="Number of epochs")
+parser.add_argument("--max_length", type=int, default=512, help="Max length of input tokens")
+parser.add_argument("--num_training_tokens", type=int, default=int(1e9), help="Number of training tokens")
+parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)")
+parser.add_argument("--dtype", type=str, choices=["bf16", "fp16"], default="fp16", help="Data type to use")  # TODO: for now only bf16 works
+parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer")
+parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay for optimizer")
+parser.add_argument("--tmax", type=int, default=30, help="Tmax for scheduler")
+parser.add_argument("--lora_config", type=str, default="config/lora_config.json", help="Path to LoRa config file")
+parser.add_argument("--galore_config", type=str, default="config/galore_config.json", help="Path to GaLore config file")
+parser.add_argument("--test", type=str, choices=["true", "false"], default="false", help="Test mode")
+args = parser.parse_args()
diff --git a/config/README.md b/config/README.md
new file mode 100644
index 0000000..88bb8ea
--- /dev/null
+++ b/config/README.md
@@ -0,0 +1,4 @@
+# Copyright Notice
+
+The LLaMa config files are the ones used in the original [GaLore project](https://github.com/jiaweizzhao/GaLore).\
+Since we want to replicate the results, we're using exactly the same settings for the LLaMa models in our experiments.
\ No newline at end of file
diff --git a/config/galore_config.json b/config/galore_config.json
new file mode 100644
index 0000000..59a1f65
--- /dev/null
+++ b/config/galore_config.json
@@ -0,0 +1,6 @@
+{
+    "rank": 128,
+    "update_proj_gap": 200,
+    "scale": 0.25,
+    "proj_type": "std"
+}
diff --git a/config/llama_1b.json b/config/llama_1b.json
new file mode 100644
index 0000000..2a068c1
--- /dev/null
+++ b/config/llama_1b.json
@@ -0,0 +1,20 @@
+{
+    "architectures": [
+        "LLaMAForCausalLM"
+    ],
+    "bos_token_id": 0,
+    "eos_token_id": 1,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "intermediate_size": 5461,
+    "initializer_range": 0.02,
+    "max_sequence_length": 1024,
+    "model_type": "llama",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 24,
+    "pad_token_id": -1,
+    "rms_norm_eps": 1e-06,
+    "transformers_version": "4.28.1",
+    "use_cache": true,
+    "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/config/llama_60m.json b/config/llama_60m.json
new file mode 100644
index 0000000..edd4995
--- /dev/null
+++ b/config/llama_60m.json
@@ -0,0 +1,20 @@
+{
+    "architectures": [
+        "LLaMAForCausalLM"
+    ],
+    "bos_token_id": 0,
+    "eos_token_id": 1,
+    "hidden_act": "silu",
+    "hidden_size": 512,
+    "intermediate_size": 1376,
+    "initializer_range": 0.02,
+    "max_sequence_length": 1024,
+    "model_type": "llama",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 8,
+    "pad_token_id": -1,
+    "rms_norm_eps": 1e-06,
+    "transformers_version": "4.28.1",
+    "use_cache": true,
+    "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/config/llama_7b.json b/config/llama_7b.json
new file mode 100644
index 0000000..cf594ec
--- /dev/null
+++ b/config/llama_7b.json
@@ -0,0 +1,20 @@
+{
+    "architectures": [
+        "LLaMAForCausalLM"
+    ],
+    "bos_token_id": 0,
+    "eos_token_id": 1,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "intermediate_size": 11008,
+    "initializer_range": 0.02,
+    "max_sequence_length": 2048,
+    "model_type": "llama",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 32,
+    "pad_token_id": -1,
+    "rms_norm_eps": 1e-06,
+    "transformers_version": "4.28.1",
+    "use_cache": true,
+    "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/config/lora_config.json b/config/lora_config.json
new file mode 100644
index 0000000..be7c7aa
--- /dev/null
+++ b/config/lora_config.json
@@ -0,0 +1,7 @@
+{
+    "r": 8,
+    "lora_alpha": 8,
+    "lora_dropout": 0.1,
+    "target_modules_finetuning": ["query", "value"],
+    "target_modules_pretraining": ["q_proj", "v_proj"]
+}
diff --git a/load_data.py b/load_data.py
new file mode 100644
index 0000000..028a2a1
--- /dev/null
+++ b/load_data.py
@@ -0,0 +1,37 @@
+import torch
+from datasets import load_dataset
+
+def load_data(args, tokenizer):
+    if args.mode == "pretraining":
+        return load_data_pretrain(args, tokenizer)
+    elif args.mode == "finetuning":
+        return load_data_finetune(args, tokenizer)
+    else:
+        raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
+
+def load_data_pretrain(args, tokenizer):
+    dataset = load_dataset("allenai/c4", "realnewslike", streaming=True, split="train")
+    dataset = dataset.take(args.num_training_tokens)  # NOTE: take() counts examples, not tokens
+
+    def tokenize_function_pretrain(batch):
+        encoding = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=args.max_length)
+        return {
+            "input_ids": torch.tensor(encoding["input_ids"]).clone().detach().to(torch.long),
+            "attention_mask": torch.tensor(encoding["attention_mask"]).clone().detach().to(torch.long),
+        }
+
+    dataset = dataset.map(tokenize_function_pretrain, remove_columns=["text", "timestamp", "url"])
+    dataset = dataset.with_format("torch")
+
+    return dataset
+
+def load_data_finetune(args, tokenizer):
+    dataset = load_dataset("glue", "sst2")
+
+    def tokenize_function_finetune(batch):
+        return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=args.max_length)
+
+    dataset = dataset.map(tokenize_function_finetune)
+    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
+
+    return dataset
diff --git a/logger.py b/logger.py
new file mode 100644
index 0000000..13e465e
--- /dev/null
+++ b/logger.py
@@ -0,0 +1,38 @@
+import torch
+import psutil
+import csv
+import math
+
+CSV_FILE = "output.csv"
+
+def init_csv():
+    """Initialize CSV file with headers."""
+    with open(CSV_FILE, mode="w", newline="") as file:
+        writer = csv.writer(file)
+        writer.writerow(["epoch", "training_step", "compute_time", "peak_memory_usage_history_GB",
+                         "peak_memory_usage_allocated_GB", "peak_memory_usage_reserved_GB",
+                         "loss", "perplexity"])
+
+def measure_memory():
+    """Measure memory usage from CUDA or CPU."""
+    if torch.cuda.is_available():
+        history = torch.cuda.memory._record_memory_history()  # NOTE: this only enables recording and returns None, so peak_history falls back to 0
+        peak_history = max([entry["allocated_bytes.all.current"] for entry in history]) / 1e9 if history else 0
+        max_allocated = torch.cuda.max_memory_allocated() / 1e9
+        max_reserved = torch.cuda.max_memory_reserved() / 1e9
+    else:
+        mem = psutil.virtual_memory()
+        peak_history = mem.used / 1e9
+        max_allocated = mem.used / 1e9
+        max_reserved = mem.total / 1e9  # Total system memory
+
+    return peak_history, max_allocated, max_reserved
+
+def log_to_csv(epoch, step, compute_time, loss):
+    """Log training metrics to CSV file."""
+    peak_history, max_allocated, max_reserved = measure_memory()
+    perplexity = math.exp(loss) if loss < 100 else float("inf")  # Avoid overflow
+
+    with open(CSV_FILE, mode="a", newline="") as file:
+        writer = csv.writer(file)
+        writer.writerow([epoch, step, compute_time, peak_history, max_allocated, max_reserved, loss, perplexity])
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..edc326f
--- /dev/null
+++ b/main.py
@@ -0,0 +1,178 @@
+from load_data import load_data
+from galore_torch import GaLoreAdamW, GaLoreAdamW8bit
+from logger import init_csv, log_to_csv
+from accelerate import Accelerator
+from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
+import torch
+from torch.utils.data import DataLoader
+from args import args
+from peft import LoraConfig, get_peft_model
+from torch.optim import AdamW
+import json
+import datetime
+
+def get_model(args):
+    """Creates the model for pretraining or fine-tuning"""
+    if args.mode == "pretraining":
+        model_config = AutoConfig.from_pretrained(f"config/{args.model}.json")
+        if args.dtype == "bf16":
+            model = AutoModelForCausalLM.from_config(model_config, torch_dtype=torch.bfloat16)
+        else:
+            model = AutoModelForCausalLM.from_config(model_config)
+
+        # in the galore project they say:
+        # "it doesn't matter which tokenizer we use, because we train from scratch
+        # T5 tokenizer was trained on C4 and we are also training on C4, so it's a good choice"
+        tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+        if model.config.pad_token_id is None or model.config.pad_token_id == -1:
+            model.config.pad_token_id = tokenizer.pad_token_id
+        model.generation_config.pad_token_id = model.config.pad_token_id
+
+    elif args.mode == "finetuning":
+        if args.model == "roberta":
+            if args.dtype == "bf16":
+                model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2, torch_dtype=torch.bfloat16)
+            else:
+                model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
+            tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+        elif args.model == "gpt2":
+            model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
+            tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
+
+            tokenizer.pad_token = tokenizer.eos_token
+            model.config.pad_token_id = tokenizer.pad_token_id
+        else:
+            raise ValueError("Invalid model name. Choose 'roberta' or 'gpt2'")
+    else:
+        raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
+
+    return model, tokenizer
+
+def load_lora_config(args):
+    """Loads LoRa configuration from file"""
+    with open(args.lora_config, "r") as f:
+        lora_params = json.load(f)
+
+    target_modules = lora_params["target_modules_finetuning"] if args.mode == "finetuning" else lora_params["target_modules_pretraining"]
+
+    return LoraConfig(
+        r=lora_params["r"],
+        lora_alpha=lora_params["lora_alpha"],
+        lora_dropout=lora_params["lora_dropout"],
+        target_modules=target_modules
+    )
+
+def load_galore_config(args):
+    """Loads GaLore configuration from file"""
+    with open(args.galore_config, "r") as f:
+        return json.load(f)
+
+def get_optimizer(args, model):
+    """Creates optimizer (GaLore, LoRa, or baseline AdamW)"""
+    if args.optimizer == "baseline":
+        return AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay), model
+    elif args.optimizer in ["galore", "galore8bit"]:
+        galore_config = load_galore_config(args)
+        trainable_params = [p for p in model.parameters() if p.requires_grad and p.dim() > 1]
+
+        param_groups = [
+            {"params": trainable_params, **galore_config}
+        ]
+        optimizer_class = GaLoreAdamW if args.optimizer == "galore" else GaLoreAdamW8bit
+        return optimizer_class(param_groups, lr=args.lr, weight_decay=args.weight_decay), model
+    elif args.optimizer in ["lora", "lora+galore8bit"]:
+        lora_config = load_lora_config(args)
+        model = get_peft_model(model, lora_config)
+        model.print_trainable_parameters()
+
+        if args.optimizer == "lora":
+            return AdamW(model.parameters(), lr=args.lr), model
+        else:
+            galore_config = load_galore_config(args)
+            trainable_params = [p for p in model.parameters() if p.requires_grad and p.dim() > 1]
+            param_groups = [
+                {"params": trainable_params, **galore_config}
+            ]
+            return GaLoreAdamW8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay), model
+    else:
+        raise ValueError(f"Unknown optimizer: {args.optimizer}")
{args.optimizer}") + +def train(device, accelerator, scheduler, model, optimizer, dataloader, num_epochs): + """ training model """ + model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader) + model.train() + start_time = datetime.datetime.now() + + for epoch in range(num_epochs): + total_loss = 0 + batch_cnt = 0 + for batch in dataloader: + optimizer.zero_grad() + + if args.mode == "pretraining": + input_ids = batch["input_ids"].to(device) + attention_mask = batch["attention_mask"].to(device) + + outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids) + elif args.mode == "finetuning": + input_ids = batch["input_ids"].to(device) + attention_mask = batch["attention_mask"].to(device) + labels = batch["label"].to(device) + + outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) + else: + raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'") + + loss = outputs.loss + accelerator.backward(loss) + optimizer.step() + scheduler.step() + + compute_time = (datetime.datetime.now() - start_time).total_seconds() + log_to_csv(epoch + 1, batch_cnt + 1, compute_time, loss.item()) + + total_loss += loss.item() + batch_cnt += 1 + + avg_loss = total_loss / max(1, batch_cnt) + print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}") + + return model + +if __name__ == "__main__": + if (args.test == "true"): + print("Test mode") + accelerator = Accelerator() + else: + accelerator = Accelerator(mixed_precision="bf16") + + device = "cuda" if torch.cuda.is_available() else "cpu" + + print(f"Running on: {device}") + print(f"Using optimizer: {args.optimizer}") + init_csv() + + model, tokenizer = get_model(args) + + dataset = load_data(args, tokenizer) + + optimizer, model = get_optimizer(args, model) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.tmax) + + shuffle = True if args.shuffle == "true" else False + if args.mode == "pretraining": + dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle) + elif args.mode == "finetuning": + dataloader = DataLoader(dataset["train"], batch_size=args.batch_size, shuffle=shuffle) + else: + raise ValueError("Invalid mode. 
+
+    trained_model = train(device, accelerator, scheduler, model, optimizer, dataloader, num_epochs=args.num_epochs)
+
+    file_name = f"{args.model}_{args.optimizer}_pretrained" if args.mode == "pretraining" else f"{args.model}_{args.optimizer}_finetuned"
+    model_path = f"models/{file_name}"
+    trained_model.save_pretrained(model_path)
+    tokenizer.save_pretrained(model_path)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0220976
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+galore-torch
+datasets
+transformers
+torch
+accelerate
+psutil
+peft
+argparse
\ No newline at end of file
diff --git a/run_glue_benchmark.py b/run_glue_benchmark.py
new file mode 100644
index 0000000..145112b
--- /dev/null
+++ b/run_glue_benchmark.py
@@ -0,0 +1,60 @@
+import torch
+import argparse
+from datasets import load_dataset, load_metric
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+# TODO just a draft - not complete, not tested
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model_path", type=str, required=True, help="Path to the trained model")
+parser.add_argument("--task", type=str, choices=["sst2", "mnli", "qqp"], default="sst2", help="GLUE task for evaluation")
+parser.add_argument("--batch_size", type=int, default=8)
+parser.add_argument("--model_type", type=str, choices=["roberta", "gpt2"], required=True)
+args = parser.parse_args()
+
+print(f"Loading model from {args.model_path}...")
+if args.model_type == "roberta":
+    model = AutoModelForSequenceClassification.from_pretrained(args.model_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+elif args.model_type == "gpt2":
+    model = AutoModelForCausalLM.from_pretrained(args.model_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+    tokenizer.pad_token = tokenizer.eos_token
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+model.eval()
+
+dataset = load_dataset("glue", args.task)
+metric = load_metric("glue", args.task)
+
+def preprocess_function(examples):
+    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=128)
+
+encoded_dataset = dataset["validation"].map(preprocess_function, batched=True)
+dataloader = DataLoader(encoded_dataset, batch_size=args.batch_size)
+
+all_preds = []
+all_labels = []
+
+print("Starting evaluation...")
+with torch.no_grad():
+    for batch in tqdm(dataloader):
+        input_ids = torch.tensor(batch["input_ids"]).to(device)
+        attention_mask = torch.tensor(batch["attention_mask"]).to(device)
+        labels = batch["label"]
+
+        if args.model_type == "roberta":
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
+        elif args.model_type == "gpt2":
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+            preds = torch.argmax(outputs.logits[:, -1, :], dim=-1).cpu().numpy()
+
+        all_preds.extend(preds)
+        all_labels.extend(labels)
+
+result = metric.compute(predictions=all_preds, references=all_labels)
+print(f"Benchmark Results for {args.task}: {result}")
diff --git a/scripts/shell/pretrain.sh b/scripts/shell/pretrain.sh
new file mode 100644
index 0000000..8c58d90
--- /dev/null
+++ b/scripts/shell/pretrain.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+python main.py \
+    --mode pretraining \
+    --optimizer galore \
+    --model llama_1b \
+    --batch_size 8 \
+    --num_epochs 30 \
+    --num_training_tokens 1000000 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr 1e-4 \
+    --weight_decay 0.01 \
+    --tmax 30 \
+    --test false
diff --git a/scripts/shell/test.sh b/scripts/shell/test.sh
new file mode 100644
index 0000000..f1281e8
--- /dev/null
+++ b/scripts/shell/test.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+python main.py \
+    --mode pretraining \
+    --optimizer galore \
+    --model llama_60m \
+    --batch_size 2 \
+    --num_epochs 3 \
+    --num_training_tokens 100 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr 4e-4 \
+    --weight_decay 0.01 \
+    --tmax 30 \
+    --test true
diff --git a/scripts/slurm/run_pretrain.sh b/scripts/slurm/run_pretrain.sh
new file mode 100644
index 0000000..87d00b6
--- /dev/null
+++ b/scripts/slurm/run_pretrain.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#SBATCH --job-name=galore_pretrain   # Job name
+#SBATCH --nodes=1                    # Use 1 node
+#SBATCH --ntasks=1                   # 1 task
+#SBATCH --mem=32G                    # Allocate 32 GB RAM
+#SBATCH --cpus-per-task=4            # 4 CPU cores per task
+#SBATCH --gres=gpu:1                 # Request 1 GPU
+#SBATCH --time=00:05:00              # Maximum runtime of 5 minutes
+
+# Set TMPDIR to a different directory
+export TMPDIR=/home/apzgb/tmp
+
+# Run the Docker image from Docker Hub
+srun \
+    --container-image=docker://tommotius/galore_image \
+    --container-name=ml-container \
+    --container-mounts=/home/apzgb/Dokumente/GaLoreReplication:/workspace \
+    bash -i -c "cd /workspace && chmod +x ./scripts/shell/pretrain.sh && ./scripts/shell/pretrain.sh"
\ No newline at end of file
diff --git a/scripts/windows/finetune_lora.bat b/scripts/windows/finetune_lora.bat
new file mode 100644
index 0000000..fa431f5
--- /dev/null
+++ b/scripts/windows/finetune_lora.bat
@@ -0,0 +1,15 @@
+@echo off
+python main.py ^
+    --mode finetuning ^
+    --optimizer lora ^
+    --model roberta ^
+    --batch_size 8 ^
+    --num_epochs 30 ^
+    --max_length 512 ^
+    --num_training_tokens 1000000 ^
+    --shuffle false ^
+    --dtype bf16 ^
+    --lr 4e-4 ^
+    --weight_decay 0.01 ^
+    --tmax 30 ^
+    --test true
\ No newline at end of file
diff --git a/scripts/windows/test.bat b/scripts/windows/test.bat
new file mode 100644
index 0000000..6fbf418
--- /dev/null
+++ b/scripts/windows/test.bat
@@ -0,0 +1,15 @@
+@echo off
+python main.py ^
+    --mode pretraining ^
+    --optimizer galore ^
+    --model llama_60m ^
+    --batch_size 8 ^
+    --num_epochs 30 ^
+    --max_length 512 ^
+    --num_training_tokens 1000000 ^
+    --shuffle false ^
+    --dtype bf16 ^
+    --lr 4e-4 ^
+    --weight_decay 0.01 ^
+    --tmax 30 ^
+    --test true
\ No newline at end of file
--
GitLab