From e03afc92c2dce264dec7e27249a6f8b6bc852a6a Mon Sep 17 00:00:00 2001
From: Till-Ole Herbst <till-ole.herbst@student.uni-halle.de>
Date: Wed, 4 Sep 2024 11:26:09 +0200
Subject: [PATCH] pipeline

---
 batch_pipeline.sh | 10 ++++++
 pipeline.sh       | 77 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 batch_pipeline.sh
 create mode 100644 pipeline.sh

diff --git a/batch_pipeline.sh b/batch_pipeline.sh
new file mode 100644
index 000000000..058e287b5
--- /dev/null
+++ b/batch_pipeline.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#SBATCH --job-name=pipe1
+#SBATCH --gres=gpu:1
+#SBATCH --cpus-per-gpu=8
+#SBATCH --mem-per-cpu=4G
+
+cd /home/ambcj/BA/text2sql-sft
+source dbgpt-hub/bin/activate
+
+sh pipeline.sh
diff --git a/pipeline.sh b/pipeline.sh
new file mode 100644
index 000000000..13f3937c9
--- /dev/null
+++ b/pipeline.sh
@@ -0,0 +1,77 @@
+wandb offline # disable wandb syncing
+# A100, single GPU
+current_date=$(date +"%Y%m%d_%H%M")
+train_log="output/logs/train_sft_test_${current_date}.log"
+start_time=$(date +%s)
+echo "Train Start time: $(date -d @$start_time +'%Y-%m-%d %H:%M:%S')" >>${train_log}
+
+# default: zero-shot training
+num_shot=0
+
+# one-shot training
+# num_shot=1
+
+dataset_train="example_text2sql_train"
+dataset_dev="data/example_text2sql_dev.json"
+if [ "$num_shot" -eq 1 ]; then
+    dataset_train="example_text2sql_train_one_shot"
+fi
+model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct"
+output_dir="output/adapter/llama3_instruct_qlora"
+
+# The default parameters fit on a single A100 (40G) GPU; if your server cannot handle them, use smaller values (e.g. a lower lora_rank) or QLoRA with 4-bit quantization.
+CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') python -m src.sft_train \
+    --model_name_or_path $model_name_or_path \
+    --quantization_bit 4 \
+    --do_train \
+    --dataset $dataset_train \
+    --max_source_length 2048 \
+    --max_target_length 512 \
+    --finetuning_type lora \
+    --lora_target q_proj,v_proj \
+    --template llama3 \
+    --lora_rank 64 \
+    --lora_alpha 32 \
+    --output_dir $output_dir \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 16 \
+    --lr_scheduler_type cosine_with_restarts \
+    --logging_steps 50 \
+    --save_steps 2000 \
+    --learning_rate 2e-4 \
+    --num_train_epochs 8 \
+    --plot_loss \
+    --bf16 >> ${train_log}
+    # --bf16  # V100 GPUs do not support bf16
+
+echo "############train end###############" >>${train_log}
+echo "Train End time: $(date)" >>${train_log}
+end_time=$(date +%s)
+duration=$((end_time - start_time))
+hours=$((duration / 3600))
+min=$(( (duration % 3600) / 60))
+echo "Time elapsed: ${hours} hour ${min} min" >>${train_log}
+
+current_date=$(date +"%Y%m%d_%H%M")
+pred_log="output/logs/pred_test_${current_date}.log"
+start_time=$(date +%s)
+echo "Pred Start time: $(date -d @$start_time +'%Y-%m-%d %H:%M:%S')" >>${pred_log}
+
+CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') python -m src.predict \
+    --model_name_or_path $model_name_or_path \
+    --template llama3 \
+    --quantization_bit 4 \
+    --finetuning_type lora \
+    --predicted_input_filename $dataset_dev \
+    --checkpoint_dir $output_dir \
+    --predicted_out_filename output/pred/pred_llama3_instruct_qlora.sql >> ${pred_log}
+
+echo "############pred end###############" >>${pred_log}
+echo "Pred End time: $(date)" >>${pred_log}
+end_time=$(date +%s)
+duration=$((end_time - start_time))
+hours=$((duration / 3600))
+min=$(( (duration % 3600) / 60))
+echo "Time elapsed: ${hours} hour ${min} min" >>${pred_log}
--
GitLab