diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e7d32d60bb42a2d0e34a06d1cefaba1d9dc7bde5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__
+test.py
+output.txt
+output.csv
+models/
diff --git a/Dockerfile b/Dockerfile
index d119cb3f8e9ba88b1534f32f9d0aaa8d614094d6..355ff3e93caee71b8bb00101314fd719d93dc990 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1 +1,13 @@
 FROM nvidia/cuda:12.3.0-devel-ubuntu20.04
+
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --upgrade pip
+
+COPY requirements.txt /workspace/requirements.txt
+RUN pip3 install -r /workspace/requirements.txt
+
+WORKDIR /workspace
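+
+# Example usage (the image tag and mount are illustrative; the Slurm script pulls tommotius/galore_image instead):
+#   docker build -t galore_image .
+#   docker run --gpus all -it -v "$(pwd)":/workspace galore_image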
diff --git a/args.py b/args.py
new file mode 100644
index 0000000000000000000000000000000000000000..873e4ff9d8b6ef20a17b9e707d4395b42442b38e
--- /dev/null
+++ b/args.py
@@ -0,0 +1,19 @@
+import argparse
+
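+# Example invocation (see scripts/shell/pretrain.sh):
+#   python main.py --mode pretraining --optimizer galore --model llama_1b --batch_size 8 --dtype bf16 --lr 1e-4
+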
+parser = argparse.ArgumentParser(description="Run training")
+parser.add_argument("--mode", type=str, choices=["pretraining", "finetuning"], required=True, help="Training mode to use")
+parser.add_argument("--optimizer", type=str, choices=["lora", "galore", "galore8bit", "lora+galore8bit", "baseline"], required=True, help="Optimizer type to use")
+parser.add_argument("--model", type=str, choices=["llama_60m", "llama_1b", "llama_7b", "roberta", "gpt2"], required=True, help="Model to use")
+parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
+parser.add_argument("--num_epochs", type=int, default=30, help="Number of epochs")
+parser.add_argument("--max_length", type=int, default=512, help="Max length of input tokens")
+parser.add_argument("--num_training_tokens", type=int, default=1e9, help="Number of training tokens")
+parser.add_argument("--shuffle", type=str, choices=["true", "false"], default="false", help="Shuffle data (doesn't work in streaming mode)")
+parser.add_argument("--dtype", type=str, choices=["bf16", "fp16"], default="fp16", help="Data type to use") # TODO for now just bf16 working
+parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate for optimizer")
+parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay for optimizer")
+parser.add_argument("--tmax", type=int, default=30, help="Tmax for scheduler")
+parser.add_argument("--lora_config", type=str, default="config/lora_config.json", help="Path to LoRa config file")
+parser.add_argument("--galore_config", type=str, default="config/galore_config.json", help="Path to GaLore config file")
+parser.add_argument("--test", type=str, choices=["true", "false"], default="false", help="Test mode")
+args = parser.parse_args()
diff --git a/config/README.md b/config/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..88bb8eaab9695a9d64ffdd63d11c14a4fdb251cd
--- /dev/null
+++ b/config/README.md
@@ -0,0 +1,4 @@
+# Copyright Notice
+
+The LLaMA config files are the ones used in the original [GaLore project](https://github.com/jiaweizzhao/GaLore).\
+Since we want to replicate its results, we use exactly the same settings for the LLaMA models in our experiments.
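+
+`lora_config.json` and `galore_config.json` hold the LoRA and GaLore hyperparameters that `main.py` reads via the `--lora_config` and `--galore_config` arguments.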
diff --git a/config/galore_config.json b/config/galore_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..59a1f652bb6738cb6fb29035ca37737145b27884
--- /dev/null
+++ b/config/galore_config.json
@@ -0,0 +1,6 @@
+{
+    "rank": 128,
+    "update_proj_gap": 200,
+    "scale": 0.25,
+    "proj_type": "std"
+}
diff --git a/config/llama_1b.json b/config/llama_1b.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a068c150bf5d9ad3fec3c0786d9c7a30faa1dcb
--- /dev/null
+++ b/config/llama_1b.json
@@ -0,0 +1,20 @@
+{
+    "architectures": [
+        "LLaMAForCausalLM"
+    ],
+    "bos_token_id": 0,
+    "eos_token_id": 1,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "intermediate_size": 5461,
+    "initializer_range": 0.02,
+    "max_sequence_length": 1024,
+    "model_type": "llama",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 24,
+    "pad_token_id": -1,
+    "rms_norm_eps": 1e-06,
+    "transformers_version": "4.28.1",
+    "use_cache": true,
+    "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/config/llama_60m.json b/config/llama_60m.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd49952950570b09152e6a07179f50d7df31503
--- /dev/null
+++ b/config/llama_60m.json
@@ -0,0 +1,20 @@
+{
+    "architectures": [
+        "LLaMAForCausalLM"
+    ],
+    "bos_token_id": 0,
+    "eos_token_id": 1,
+    "hidden_act": "silu",
+    "hidden_size": 512,
+    "intermediate_size": 1376,
+    "initializer_range": 0.02,
+    "max_sequence_length": 1024,
+    "model_type": "llama",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 8,
+    "pad_token_id": -1,
+    "rms_norm_eps": 1e-06,
+    "transformers_version": "4.28.1",
+    "use_cache": true,
+    "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/config/llama_7b.json b/config/llama_7b.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf594eca9a98dc764f8f72bf99665ba04522e943
--- /dev/null
+++ b/config/llama_7b.json
@@ -0,0 +1,20 @@
+{
+    "architectures": [
+        "LLaMAForCausalLM"
+    ],
+    "bos_token_id": 0,
+    "eos_token_id": 1,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "intermediate_size": 11008,
+    "initializer_range": 0.02,
+    "max_sequence_length": 2048,
+    "model_type": "llama",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 32,
+    "pad_token_id": -1,
+    "rms_norm_eps": 1e-06,
+    "transformers_version": "4.28.1",
+    "use_cache": true,
+    "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/config/lora_config.json b/config/lora_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..be7c7aa3424bdb327a3c8f4e9e9be716dd0976ad
--- /dev/null
+++ b/config/lora_config.json
@@ -0,0 +1,7 @@
+{
+    "r": 8,
+    "lora_alpha": 8,
+    "lora_dropout": 0.1,
+    "target_modules_finetuning": ["query", "value"],
+    "target_modules_pretraining": ["q_proj", "v_proj"]
+}
diff --git a/load_data.py b/load_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..028a2a14abf29d79a122e82c05c5052b62d96abc
--- /dev/null
+++ b/load_data.py
@@ -0,0 +1,37 @@
+import torch
+from datasets import load_dataset
+
+def load_data(args, tokenizer):
+    if args.mode == "pretraining":
+        return load_data_pretrain(args, tokenizer)
+    elif args.mode == "finetuning":
+        return load_data_finetune(args, tokenizer)
+    else:
+        raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
+
+def load_data_pretrain(args, tokenizer):
+    dataset = load_dataset("allenai/c4", "realnewslike", streaming=True, split="train")
+    dataset = dataset.take(args.num_training_tokens)
+
+    def tokenize_function_pretrain(batch):
+        encoding = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=args.max_length)
+        return {
+            "input_ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
+            "attention_mask": torch.tensor(encoding["attention_mask"], dtype=torch.long),
+        }
+
+    dataset = dataset.map(tokenize_function_pretrain, remove_columns=["text", "timestamp", "url"])
+    # with_format returns a new dataset object, so the result must be reassigned
+    dataset = dataset.with_format("torch")
+
+    return dataset
+
+def load_data_finetune(args, tokenizer):
+    dataset = load_dataset("glue", "sst2")
+
+    def tokenize_function_finetune(batch):
+        return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=args.max_length)
+    
+    dataset = dataset.map(tokenize_function_finetune)
+    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
+
+    return dataset
diff --git a/logger.py b/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..13e465ec578b1e3ce02a3cf154892c211a03bce0
--- /dev/null
+++ b/logger.py
@@ -0,0 +1,38 @@
+import torch
+import psutil
+import csv
+import math
+
+CSV_FILE = "output.csv"
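+# Memory columns are reported in GB (bytes / 1e9); compute_time is seconds since training start.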
+
+def init_csv():
+    """Initialize CSV file with headers."""
+    with open(CSV_FILE, mode="w", newline="") as file:
+        writer = csv.writer(file)
+        writer.writerow(["epoch", "training_step", "compute_time", "peak_memory_usage_history_GB", 
+                         "peak_memory_usage_allocated_GB", "peak_memory_usage_reserved_GB", 
+                         "loss", "perplexity"])
+
+def measure_memory():
+    """Measure memory usage from CUDA or CPU."""
+    if torch.cuda.is_available():
+        # torch.cuda.memory._record_memory_history() only enables trace recording and returns None,
+        # so report the current allocation here; the true peaks come from the two calls below,
+        # which track the maximum since program start (or since the last reset_peak_memory_stats()).
+        peak_history = torch.cuda.memory_allocated() / 1e9
+        max_allocated = torch.cuda.max_memory_allocated() / 1e9
+        max_reserved = torch.cuda.max_memory_reserved() / 1e9
+    else:
+        mem = psutil.virtual_memory()
+        peak_history = mem.used / 1e9
+        max_allocated = mem.used / 1e9
+        max_reserved = mem.total / 1e9  # Total system memory
+
+    return peak_history, max_allocated, max_reserved
+
+def log_to_csv(epoch, step, compute_time, loss):
+    """Log training metrics to CSV file."""
+    peak_history, max_allocated, max_reserved = measure_memory()
+    perplexity = math.exp(loss) if loss < 100 else float("inf")  # Avoid overflow
+
+    with open(CSV_FILE, mode="a", newline="") as file:
+        writer = csv.writer(file)
+        writer.writerow([epoch, step, compute_time, peak_history, max_allocated, max_reserved, loss, perplexity])
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..edc326f04addfdbd8404df6575d2c8cb294194cd
--- /dev/null
+++ b/main.py
@@ -0,0 +1,178 @@
+from load_data import load_data
+from galore_torch import GaLoreAdamW, GaLoreAdamW8bit
+from logger import init_csv, log_to_csv
+from accelerate import Accelerator
+from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
+import torch
+from torch.utils.data import DataLoader
+from args import args
+from peft import LoraConfig, get_peft_model
+from torch.optim import AdamW
+import json
+import datetime
+
+def get_model(args):
+    """ Creates model for Pretraining or Fine-Tuning """
+    if args.mode == "pretraining":
+        model_config = AutoConfig.from_pretrained(f"config/{args.model}.json")
+        if args.dtype == "bf16":
+            model = AutoModelForCausalLM.from_config(model_config, torch_dtype=torch.bfloat16)
+        else:
+            model = AutoModelForCausalLM.from_config(model_config)
+        
+        # The GaLore project notes that the tokenizer choice doesn't matter much when
+        # training from scratch; the T5 tokenizer was trained on C4, which is also our
+        # pretraining corpus, so it's a reasonable choice.
+        tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+        if model.config.pad_token_id is None or model.config.pad_token_id == -1:
+            model.config.pad_token_id = tokenizer.pad_token_id
+        model.generation_config.pad_token_id = model.config.pad_token_id
+
+    elif args.mode == "finetuning":
+        if args.model == "roberta":
+            if args.dtype == "bf16":
+                model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2, torch_dtype=torch.bfloat16)
+            else:
+                model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
+            tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+        elif args.model == "gpt2":
+            model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
+            tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side = "left")
+
+            tokenizer.pad_token = tokenizer.eos_token
+            model.config.pad_token_id = tokenizer.pad_token_id
+        else:
+            raise ValueError("Invalid model name. Choose 'roberta' or 'gpt2'")
+    else:
+        raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
+    
+    return model, tokenizer
+
+def load_lora_config(args):
+    """Loads LoRa configuration from file"""
+    with open(args.lora_config, "r") as f:
+        lora_params = json.load(f)
+
+    target_modules = lora_params["target_modules_finetuning"] if args.mode == "finetuning" else lora_params["target_modules_pretraining"]
+
+    return LoraConfig(
+        r=lora_params["r"],
+        lora_alpha=lora_params["lora_alpha"],
+        lora_dropout=lora_params["lora_dropout"],
+        target_modules=target_modules
+    )
+
+def load_galore_config(args):
+    """Loads GaLore configuration from file"""
+    with open(args.galore_config, "r") as f:
+        return json.load(f)
+
+def get_optimizer(args, model):
+    """Creates optimizer (GaLore, LoRa, or baseline AdamW)"""
+    if args.optimizer == "baseline":
+        return AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay), model
+    elif args.optimizer in ["galore", "galore8bit"]:
+        galore_config = load_galore_config(args)
+        # GaLore projects the gradients of 2-D weight matrices only; 1-D parameters
+        # (biases, norms) still need to be optimized as a regular AdamW group.
+        galore_params = [p for p in model.parameters() if p.requires_grad and p.dim() > 1]
+        regular_params = [p for p in model.parameters() if p.requires_grad and p.dim() <= 1]
+
+        param_groups = [
+            {"params": regular_params},
+            {"params": galore_params, **galore_config}
+        ]
+        optimizer_class = GaLoreAdamW if args.optimizer == "galore" else GaLoreAdamW8bit
+        return optimizer_class(param_groups, lr=args.lr, weight_decay=args.weight_decay), model
+    elif args.optimizer in ["lora", "lora+galore8bit"]:
+        lora_config = load_lora_config(args)
+        model = get_peft_model(model, lora_config)
+        model.print_trainable_parameters()
+        
+        if args.optimizer == "lora":
+            return AdamW(model.parameters(), lr=args.lr), model
+        else:
+            galore_config = load_galore_config(args)
+            trainable_params = [p for p in model.parameters() if p.requires_grad and p.dim() > 1]
+            param_groups = [
+                {"params": trainable_params, **galore_config}
+            ]
+            return GaLoreAdamW8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay), model
+    else:
+        raise ValueError(f"Unknown optimizer: {args.optimizer}")
+
+def train(device, accelerator, scheduler, model, optimizer, dataloader, num_epochs):
+    """ training model """
+    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
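+    # accelerator.prepare() handles device placement and mixed precision; the explicit
+    # .to(device) calls below are therefore redundant but harmless.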
+    model.train()
+    start_time = datetime.datetime.now()
+    
+    for epoch in range(num_epochs):
+        total_loss = 0
+        batch_cnt = 0
+        for batch in dataloader:
+            optimizer.zero_grad()
+            
+            if args.mode == "pretraining":
+                input_ids = batch["input_ids"].to(device)
+                attention_mask = batch["attention_mask"].to(device)
+
+                outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
+            elif args.mode == "finetuning":
+                input_ids = batch["input_ids"].to(device)
+                attention_mask = batch["attention_mask"].to(device)
+                labels = batch["label"].to(device)
+
+                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+            else:
+                raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
+
+            loss = outputs.loss
+            accelerator.backward(loss)
+            optimizer.step()
+
+            compute_time = (datetime.datetime.now() - start_time).total_seconds()
+            log_to_csv(epoch + 1, batch_cnt + 1, compute_time, loss.item())
+
+            total_loss += loss.item()
+            batch_cnt += 1
+
+        # Step the cosine schedule once per epoch, since tmax is given in epochs (default 30)
+        scheduler.step()
+        avg_loss = total_loss / max(1, batch_cnt)
+        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")
+    
+    return model
+
+if __name__ == "__main__":
+    if args.test == "true":
+        print("Test mode")
+        accelerator = Accelerator()
+    else:
+        accelerator = Accelerator(mixed_precision="bf16")
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    print(f"Running on: {device}")
+    print(f"Using optimizer: {args.optimizer}")
+    init_csv()
+
+    model, tokenizer = get_model(args)
+
+    dataset = load_data(args, tokenizer)
+
+    optimizer, model = get_optimizer(args, model)
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.tmax)
+
+    shuffle = args.shuffle == "true"
+    if args.mode == "pretraining":
+        dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle)
+    elif args.mode == "finetuning":
+        dataloader = DataLoader(dataset["train"], batch_size=args.batch_size, shuffle=shuffle)
+    else:
+        raise ValueError("Invalid mode. Choose 'pretraining' or 'finetuning'")
+
+    trained_model = train(device, accelerator, scheduler, model, optimizer, dataloader, num_epochs=args.num_epochs)
+
+    file_name = f"{args.model}_{args.optimizer}_pretrained" if args.mode == "pretraining" else f"{args.model}_{args.optimizer}_finetuned"
+    model_path = f"models/{file_name}"
+    trained_model.save_pretrained(model_path)
+    tokenizer.save_pretrained(model_path)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..022097615bd165545cfa6c1325d860ec3aaa8ae1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+galore-torch
+datasets
+transformers
+torch
+accelerate
+psutil
+peft
+tqdm
+evaluate
\ No newline at end of file
diff --git a/run_glue_benchmark.py b/run_glue_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..145112b8544b8d1f655b8fcb64885fa1d26a295d
--- /dev/null
+++ b/run_glue_benchmark.py
@@ -0,0 +1,60 @@
+import torch
+import argparse
+import evaluate
+from datasets import load_dataset
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+# TODO just a draft - not complete, not tested
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model_path", type=str, required=True, help="Path to the trained model")
+parser.add_argument("--task", type=str, choices=["sst2", "mnli", "qqp"], default="sst2", help="GLUE task for evaluation")
+parser.add_argument("--batch_size", type=int, default=8)
+parser.add_argument("--model_type", type=str, choices=["roberta", "gpt2"], required=True)
+args = parser.parse_args()
+
+print(f"Loading model from {args.model_path}...")
+if args.model_type == "roberta":
+    model = AutoModelForSequenceClassification.from_pretrained(args.model_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+elif args.model_type == "gpt2":
+    model = AutoModelForCausalLM.from_pretrained(args.model_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+    tokenizer.pad_token = tokenizer.eos_token
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+model.eval()
+
+dataset = load_dataset("glue", args.task)
+# load_metric was removed from recent versions of datasets; use the evaluate library instead
+metric = evaluate.load("glue", args.task)
+
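+# NOTE: this preprocessing only covers single-sentence tasks (sst2); mnli and qqp use
+# sentence pairs (premise/hypothesis, question1/question2) and would need their own handling.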
+def preprocess_function(examples):
+    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=128)
+
+encoded_dataset = dataset["validation"].map(preprocess_function, batched=True)
+dataloader = DataLoader(encoded_dataset, batch_size=args.batch_size)
+
+all_preds = []
+all_labels = []
+
+print("Starting evaluation...")
+with torch.no_grad():
+    for batch in tqdm(dataloader):
+        input_ids = batch["input_ids"].to(device)
+        attention_mask = batch["attention_mask"].to(device)
+        labels = batch["label"]
+
+        if args.model_type == "roberta":
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
+        elif args.model_type == "gpt2":
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+            preds = torch.argmax(outputs.logits[:, -1, :], dim=-1).cpu().numpy()
+
+        all_preds.extend(preds)
+        all_labels.extend(labels.tolist())
+
+result = metric.compute(predictions=all_preds, references=all_labels)
+print(f"Benchmark Results for {args.task}: {result}")
diff --git a/scripts/shell/pretrain.sh b/scripts/shell/pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8c58d900571f5e60f265392d66f6b859348c36ba
--- /dev/null
+++ b/scripts/shell/pretrain.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
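+# Pretrains LLaMA-1B on C4 (realnewslike) with the GaLore optimizer in bf16.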
+python main.py \
+    --mode pretraining \
+    --optimizer galore \
+    --model llama_1b \
+    --batch_size 8 \
+    --num_epochs 30 \
+    --num_training_tokens 1000000 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr 1e-4 \
+    --weight_decay 0.01 \
+    --tmax 30 \
+    --test false
diff --git a/scripts/shell/test.sh b/scripts/shell/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f1281e87063ac08a529abbb55fc3b654e2689e88
--- /dev/null
+++ b/scripts/shell/test.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
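+# Quick smoke test: LLaMA-60M pretraining on 100 streamed C4 documents.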
+python main.py \
+    --mode pretraining \
+    --optimizer galore \
+    --model llama_60m \
+    --batch_size 2 \
+    --num_epochs 3 \
+    --num_training_tokens 100 \
+    --max_length 512 \
+    --shuffle false \
+    --dtype bf16 \
+    --lr 4e-4 \
+    --weight_decay 0.01 \
+    --tmax 30 \
+    --test true
diff --git a/scripts/slurm/run_pretrain.sh b/scripts/slurm/run_pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..87d00b6a722885820f5d23bde77e16573e7aedba
--- /dev/null
+++ b/scripts/slurm/run_pretrain.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#SBATCH --job-name=galore_pretrain   # Job name
+#SBATCH --nodes=1                    # Use 1 node
+#SBATCH --ntasks=1                    # 1 task
+#SBATCH --mem=32G                     # Allocate 32 GB of RAM
+#SBATCH --cpus-per-task=4             # 4 CPU cores per task
+#SBATCH --gres=gpu:1                  # Request 1 GPU
+#SBATCH --time=00:05:00               # Maximum runtime of 5 minutes
+
+# Set TMPDIR to a different directory
+export TMPDIR=/home/apzgb/tmp
+
+# Run the Docker image from Docker Hub
+srun \
+    --container-image=docker://tommotius/galore_image \
+    --container-name=ml-container \
+    --container-mounts=/home/apzgb/Dokumente/GaLoreReplication:/workspace \
+    bash -i -c "cd /workspace && chmod +x ./scripts/shell/pretrain.sh && ./scripts/shell/pretrain.sh"
\ No newline at end of file
diff --git a/scripts/windows/finetune_lora.bat b/scripts/windows/finetune_lora.bat
new file mode 100644
index 0000000000000000000000000000000000000000..fa431f54db483b1f4bafcdcfd7c1f31a0cb03643
--- /dev/null
+++ b/scripts/windows/finetune_lora.bat
@@ -0,0 +1,15 @@
+@echo off
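+REM Fine-tunes RoBERTa on SST-2 (GLUE) with LoRA.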
+python main.py ^
+    --mode finetuning ^
+    --optimizer lora ^
+    --model roberta ^
+    --batch_size 8 ^
+    --num_epochs 30 ^
+    --max_length 512 ^
+    --num_training_tokens 1000000 ^
+    --shuffle false ^
+    --dtype bf16 ^
+    --lr 4e-4 ^
+    --weight_decay 0.01 ^
+    --tmax 30 ^
+    --test true
\ No newline at end of file
diff --git a/scripts/windows/test.bat b/scripts/windows/test.bat
new file mode 100644
index 0000000000000000000000000000000000000000..6fbf4183e32cf32142c95cfa915981a79259ccfc
--- /dev/null
+++ b/scripts/windows/test.bat
@@ -0,0 +1,15 @@
+@echo off
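+REM Windows test run: LLaMA-60M pretraining with GaLore (--test true runs the Accelerator without mixed precision).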
+python main.py ^
+    --mode pretraining ^
+    --optimizer galore ^
+    --model llama_60m ^
+    --batch_size 8 ^
+    --num_epochs 30 ^
+    --max_length 512 ^
+    --num_training_tokens 1000000 ^
+    --shuffle false ^
+    --dtype bf16 ^
+    --lr 4e-4 ^
+    --weight_decay 0.01 ^
+    --tmax 30 ^
+    --test true
\ No newline at end of file