Commit c9bcc6b6 authored by Armin Bacher
Delete GPT-2-Small-2k.py

parent f8237b9f
import time
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset
from torch.utils.data import DataLoader
# --- Settings ---
BATCH_SIZE = 16
SEQ_LEN = 2048
NUM_STEPS = 100
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MIXED_PRECISION = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# GPT-2 Small model parameters
NUM_LAYERS = 12
HIDDEN_SIZE = 768
NUM_HEADS = 12
HEAD_DIM = HIDDEN_SIZE // NUM_HEADS
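
# Note: the pretrained "gpt2" checkpoint only has n_positions = 1024, so feeding
# it SEQ_LEN = 2048 exceeds its learned position-embedding table. Below is a
# minimal sketch (not part of the original script; the helper name is ours) of
# how one might load GPT-2 with an enlarged context, letting the oversized
# position-embedding table be freshly initialized.
def load_gpt2_with_long_context(seq_len=SEQ_LEN, **from_pretrained_kwargs):
    from transformers import GPT2Config
    config = GPT2Config.from_pretrained("gpt2", n_positions=seq_len)
    # ignore_mismatched_sizes skips the 1024-row wpe weight from the checkpoint
    # and keeps the randomly initialized 2048-row table instead.
    return GPT2LMHeadModel.from_pretrained(
        "gpt2", config=config, ignore_mismatched_sizes=True, **from_pretrained_kwargs
    )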
dataset = load_dataset("wikitext", "wikitext-103-v1")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS

def tokenize_function(examples):
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=SEQ_LEN)
    # Unshifted copy of the input IDs; GPT2LMHeadModel shifts the labels internally.
    # The collator below rebuilds the labels anyway, masking padding with -100.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
dataloader = DataLoader(tokenized_datasets["train"], batch_size=BATCH_SIZE, collate_fn=data_collator)
def compute_flops(batch_size, seq_len, num_layers, hidden_size):
    """Rough estimate of the training FLOPs for one optimizer step."""
    return 6 * num_layers * (hidden_size ** 2) * seq_len * batch_size
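
# The estimate above amounts to 6 * (L * H^2) * tokens. The more common rule of
# thumb is 6 * N_params * tokens, where a transformer block has roughly
# 12 * H^2 weights (4*H^2 attention projections + 8*H^2 MLP), so the version
# above undercounts by about a factor of 12. A sketch of that alternative
# (the helper name is ours, not from the original script):
def compute_flops_6nd(batch_size, seq_len, num_layers, hidden_size):
    """Approximate training FLOPs per step as 6 * N_params * tokens processed."""
    approx_params = 12 * num_layers * hidden_size ** 2  # per-block weights only
    tokens = batch_size * seq_len
    return 6 * approx_params * tokens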
def benchmark_training(model, dataloader, num_steps=NUM_STEPS):
    model.to(DEVICE)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)

    if DEVICE == "cuda":
        torch.cuda.synchronize()
    start_time = time.time()

    for step, batch in enumerate(dataloader):
        if step >= num_steps:
            break
        # Token IDs and labels must stay integer; everything else (e.g. the
        # attention mask) is moved to the device in the training dtype.
        batch = {
            k: v.to(DEVICE, dtype=torch.long) if k in ("input_ids", "labels")
            else v.to(DEVICE, dtype=MIXED_PRECISION)
            for k, v in batch.items()
        }
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if DEVICE == "cuda":
        torch.cuda.synchronize()
    total_time = time.time() - start_time

    tokens_per_second = (num_steps * BATCH_SIZE * SEQ_LEN) / total_time
    flops_per_step = compute_flops(BATCH_SIZE, SEQ_LEN, NUM_LAYERS, HIDDEN_SIZE)
    # FLOPs/s = FLOPs per step * steps per second (tokens/s divided by tokens per step)
    tflops_per_sec = (flops_per_step * (tokens_per_second / (BATCH_SIZE * SEQ_LEN))) / 1e12
    return tokens_per_second, tflops_per_sec
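
# Optional: relate the achieved throughput to the GPU's peak throughput
# (model FLOPs utilization). This is a sketch, not part of the original script;
# the helper name is ours, and the default peak value is an assumption that
# must be set for the actual hardware (e.g. ~312 dense bf16 TFLOP/s on an A100).
def model_flops_utilization(tflops_per_sec, peak_tflops=312.0):
    """Fraction of the hardware's peak dense TFLOP/s actually achieved."""
    return tflops_per_sec / peak_tflops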
results = {}

for attn_type in ["standard", "flash2"]:
    print(f"Testing GPT-2 Small with {attn_type} attention...")
    if attn_type == "standard":
        model = GPT2LMHeadModel.from_pretrained("gpt2", torch_dtype=MIXED_PRECISION).to(DEVICE)
    elif attn_type == "flash2":
        model = GPT2LMHeadModel.from_pretrained("gpt2", attn_implementation="flash_attention_2", torch_dtype=MIXED_PRECISION).to(DEVICE)
    tokens_per_sec, tflops_per_sec = benchmark_training(model, dataloader)
    results[attn_type] = (tokens_per_sec, tflops_per_sec)
    print(f"{attn_type} attention: {tokens_per_sec:.2f} tokens per second, {tflops_per_sec:.2f} TFLOP/s")

print("Final results:")
for attn_type, (speed, tflops) in results.items():
    print(f"{attn_type.capitalize()} attention: {speed:.2f} tokens per second, {tflops:.2f} TFLOP/s")