From efe196164e0afe97e3d38feb00515c0f8d344568 Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Sun, 9 Feb 2025 23:08:31 +0100 Subject: [PATCH 01/12] Added untested draft for sbatch script --- scripts/batch.sh | 97 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 scripts/batch.sh diff --git a/scripts/batch.sh b/scripts/batch.sh new file mode 100644 index 0000000..3ed016f --- /dev/null +++ b/scripts/batch.sh @@ -0,0 +1,97 @@ +#!/bin/bash +#SBATCH -e slurm-%j.err # error file-path +#SBATCH -o slurm-%j.out # output file-path +#SBATCH --nodes=1 # number of work nodes +#SBATCH --ntasks=1 # number of tasks +#SBATCH --mem=4G # reserved RAM +#SBATCH --cpus-per-task=1 # number of CPU cores per task, alternative: --cpus-per-gpu +#SBATCH --gres=gpu:0 # number of GPUs +#SBATCH --time=00:15:00 # Time limit for the entire job + +KUERZEL=$1 +REPOPATH=$2 + +# Check argurment number +if [ "$#" -ne 2 ]; then + echo "Found incorrect number of arguments ($#)." + echo "Please one user shorthand and one path to the zip file of the re-CHESS repo:" + printf '\tbatch.sh <user-shorthand> <repo-path>\n' + exit 1 +fi + +# Check if the second argument is an existing file +if [ ! -f $2 ]; then + echo "File $2 does not exist." + echo " Please specify the path to the zip file of the re-CHESS repo:" + printf '\tbatch.sh <user-shorthand> <repo-path>\n' + exit 1 +fi + +# Create user directory on /zpool1/slurm_data/ if it doesn't exist yet +srun if [ ! -d /zpool1/slurm_data/${KUERZEL} ]; then mkdir /zpool1/slurm_data/${KUERZEL}; fi + +# Create directory for the job +srun mkdir /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID} +srun mkdir /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID}/re-chess + +RECHESSROOT=/zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID}/re-chess + +# Copy re-CHESS repo to the jobs folder and unpack +sbcast $2 /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID}/re-chess.zip +srun unzip /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID}/re-chess.zip -d ${RECHESSROOT} + +# Download and unpack Ollama +srun curl -L https://ollama.com/download/ollama-linux-amd64.tgz -O ${RECHESSROOT}/ollama/ollama-linux-amd64.tgz +srun tar -xzvf ${RECHESSROOT}/ollama/ollama-linux-amd64.tgz -C ${RECHESSROOT}/ollama +OLPATH=${RECHESSROOT}/ollama +OLLOGFILE=/dev/null +OLLAMA_MODELS=${OLPATH}/model + +# Pull llama3:70b +srun ${OLPATH}/./bin/ollama serve & sleep 10 ; ${OLPATH}/./bin/ollama pull llama3:70b ; kill %1 + +# Download the BIRD & Spider dataset +BIRDPATH=$RECHESSROOT/CHESS/data/BIRD +SPIDERPATH=$RECHESSROOT/CHESS/data/Spider +srun if [ ! -d ${BIRDPATH} ]; then mkdir ${BIRDPATH}; fi +srun if [ ! -d ${SPIDERPATH} ]; then mkdir ${SPIDERPATH}; fi +srun curl -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip' -o ${BIRDPATH}/dev.zip +srun curl -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/train.zip' -o ${BIRDPATH}/train.zip +srun curl -L 'https://drive.usercontent.google.com/download?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&confirm=t' -o ${SPIDERPATH}/spider_data.zip + +# Unpack BIRD & Spider datasets +srun unzip ${BIRDPATH}/dev.zip -d ${BIRDPATH} +srun unzip ${BIRDPATH}/train.zip -d ${BIRDPATH} +srun unzip ${SPIDERPATH}/spider_data.zip -d ${SPIDERPATH} + +# Unpack BIRD specific sub-directory +srun mv ${BIRDPATH}/dev_20240627 ${BIRDPATH}/dev +srun unzip ${BIRDPATH}/dev_databases.zip -d ${BIRDPATH}/dev +srun unzip ${BIRDPATH}/train_databases.zip -d ${BIRDPATH}/train + +# Create a Python venv +PYTHONVER=$(py3versions --default) +srun ${PYTHONVER} -m venv ${RECHESSROOT}/CHESS/venv +VENVPATH=${RECHESSROOT}/CHESS/venv +srun source ${VENVPATH}/bin/activate ; pip3 install -r ${RECHESSROOT}/CHESS/requirements.txt ; deactivate + +# copy the .env.rechess file to .env +srun cp ${RECHESSROOT}/CHESS/.env.rechess ${RECHESSROOT}/CHESS/.env + +# run preprocessing script +srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10; /run/./run-preprocess.sh ; kill %1; deactivate; cd $OLDPWD + +# run the agent script +srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10; /run/./run_main_ir_ss_cg_rechess.sh ; kill %1; deactivate; cd $OLDPWD + +# gather results + +##srun /zpool1/ +##srun ls -la /lib +##srun ls -la /bin +##srun ls -la /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID} +##srun ls -la ${OLPATH} +##srun ls -la ${OLPATH}/model + +# Remove all the data of the job to free memory on the node +srun rm -r /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID} -- GitLab From b07ae7af2193ffdf254e2a3da168bd9af5e115e1 Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Mon, 10 Feb 2025 12:18:11 +0100 Subject: [PATCH 02/12] Prepared "dry run" version to test batch script --- CHESS/run/run_main_ir_ss_cg_rechess.sh | 3 +++ scripts/batch.sh | 23 +++++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) create mode 100755 CHESS/run/run_main_ir_ss_cg_rechess.sh diff --git a/CHESS/run/run_main_ir_ss_cg_rechess.sh b/CHESS/run/run_main_ir_ss_cg_rechess.sh new file mode 100755 index 0000000..9fda2c5 --- /dev/null +++ b/CHESS/run/run_main_ir_ss_cg_rechess.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "Running run_main_ir_ss_cg_rechess.sh..." diff --git a/scripts/batch.sh b/scripts/batch.sh index 3ed016f..9519fdd 100644 --- a/scripts/batch.sh +++ b/scripts/batch.sh @@ -10,19 +10,28 @@ KUERZEL=$1 REPOPATH=$2 +RESULTPATH=$3 # Check argurment number -if [ "$#" -ne 2 ]; then +if [ "$#" -ne 3 ]; then echo "Found incorrect number of arguments ($#)." - echo "Please one user shorthand and one path to the zip file of the re-CHESS repo:" - printf '\tbatch.sh <user-shorthand> <repo-path>\n' + echo "Please specify a user shorthand, a re-CHESS repo zip file path and a results directory path:" + printf '\tbatch.sh <user-shorthand> <repo-path> <result-dir-path>\n' exit 1 fi # Check if the second argument is an existing file if [ ! -f $2 ]; then - echo "File $2 does not exist." - echo " Please specify the path to the zip file of the re-CHESS repo:" + echo "File $2 does not exist or is not a file." + echo "Please specify a user shorthand, a re-CHESS repo zip file path and a results directory path:" + printf '\tbatch.sh <user-shorthand> <repo-path>\n' + exit 1 +fi + +# Check if the third argument is an existing directory +if [ ! -d $3 ]; then + echo "Directory $3 does not exist or is not a directory." + echo "Please specify a user shorthand, a re-CHESS repo zip file path and a results directory path:" printf '\tbatch.sh <user-shorthand> <repo-path>\n' exit 1 fi @@ -79,12 +88,14 @@ srun source ${VENVPATH}/bin/activate ; pip3 install -r ${RECHESSROOT}/CHESS/requ srun cp ${RECHESSROOT}/CHESS/.env.rechess ${RECHESSROOT}/CHESS/.env # run preprocessing script -srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10; /run/./run-preprocess.sh ; kill %1; deactivate; cd $OLDPWD +# srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10; /run/./run-preprocess.sh ; kill %1; deactivate; cd $OLDPWD +srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10; echo "\"Running preprocessing...\"" ; kill %1; deactivate; cd $OLDPWD # run the agent script srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10; /run/./run_main_ir_ss_cg_rechess.sh ; kill %1; deactivate; cd $OLDPWD # gather results +srun zip ${RESULTPATH}/results.zip ${RECHESSROOT}/CHESS/results/* ##srun /zpool1/ ##srun ls -la /lib -- GitLab From 0cc9ec684e2198ec67a3a0e3adfb2d4c0a992937 Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Tue, 18 Feb 2025 16:08:49 +0100 Subject: [PATCH 03/12] Added some config for running on GPU-Cluster, progress on Slurm - needs fixed slurm scripts and docs - needs more config and docs matching to Slurm script - setup and run scripts for McGarret should be separated --- .../run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml | 60 +++++++++ .../configs/CHESS_IR_SS_CG_SPIDER_OSS.yaml | 60 +++++++++ .../database_utils/db_catalog/preprocess.py | 3 +- CHESS/src/llm/engine_configs.py | 40 ++++-- .../tool_kit/retrieve_entity.py | 3 +- scripts/batch.sh | 54 +++++--- scripts/cleaner.sh | 25 ++++ scripts/copyrepo.sh | 118 ++++++++++++++++++ 8 files changed, 330 insertions(+), 33 deletions(-) create mode 100644 CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml create mode 100644 CHESS/run/configs/CHESS_IR_SS_CG_SPIDER_OSS.yaml create mode 100644 scripts/cleaner.sh create mode 100644 scripts/copyrepo.sh diff --git a/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml b/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml new file mode 100644 index 0000000..81eb44a --- /dev/null +++ b/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml @@ -0,0 +1,60 @@ +setting_name: CHESS_IR_SS_CG_BIRD_OSS + +team_agents: + information_retriever: + engine: 'meta-llama/Meta-Llama-3-70B-Instruct' + tools: + extract_keywords: + template_name: 'extract_keywords' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.2 + parser_name: 'python_list_output_parser' + retrieve_entity: {} + retrieve_context: + top_k: 5 + + schema_selector: + engine: 'meta-llama/Meta-Llama-3-70B-Instruct' + tools: + filter_column: + template_name: 'filter_column' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.0 + parser_name: 'filter_column' + + select_tables: + mode: 'ask_model' + template_name: 'select_tables' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.0 + parser_name: 'select_tables' + + select_columns: + mode: 'ask_model' + template_name: 'select_columns' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.0 + parser_name: 'select_columns' + + candidate_generator: + engine: 'finetuned_nl2sql' + tools: + generate_candidate: + generator_configs: + - template_name: 'generate_candidate_one' + engine_config: + engine_name: 'finetuned_nl2sql' + temperature: 0.01 + parser_name: 'generate_candidate_gemini_markdown_cot' + sampling_count: 1 + + revise: + template_name: 'revise_one' + engine_config: + engine_name: 'finetuned_nl2sql' + temperature: 0.0 + parser_name: 'revise_new' diff --git a/CHESS/run/configs/CHESS_IR_SS_CG_SPIDER_OSS.yaml b/CHESS/run/configs/CHESS_IR_SS_CG_SPIDER_OSS.yaml new file mode 100644 index 0000000..43e4d90 --- /dev/null +++ b/CHESS/run/configs/CHESS_IR_SS_CG_SPIDER_OSS.yaml @@ -0,0 +1,60 @@ +setting_name: CHESS_IR_SS_CG_SPIDER + +team_agents: + information_retriever: + engine: 'meta-llama/Meta-Llama-3-70B-Instruct' + tools: + extract_keywords: + template_name: 'extract_keywords' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.2 + parser_name: 'python_list_output_parser' + retrieve_entity: {} + #retrieve_context: + #top_k: 5 + + schema_selector: + engine: 'meta-llama/Meta-Llama-3-70B-Instruct' + tools: + filter_column: + template_name: 'filter_column' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.0 + parser_name: 'filter_column' + + select_tables: + mode: 'ask_model' + template_name: 'select_tables' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.0 + parser_name: 'select_tables' + + select_columns: + mode: 'ask_model' + template_name: 'select_columns' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.0 + parser_name: 'select_columns' + + candidate_generator: + engine: 'meta-llama/Meta-Llama-3-70B-Instruct' + tools: + generate_candidate: + generator_configs: + - template_name: 'generate_candidate_one' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.01 + parser_name: 'generate_candidate_gemini_markdown_cot' + sampling_count: 1 + + revise: + template_name: 'revise_one' + engine_config: + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' + temperature: 0.0 + parser_name: 'revise_new' diff --git a/CHESS/src/database_utils/db_catalog/preprocess.py b/CHESS/src/database_utils/db_catalog/preprocess.py index a843fcf..27cfda5 100644 --- a/CHESS/src/database_utils/db_catalog/preprocess.py +++ b/CHESS/src/database_utils/db_catalog/preprocess.py @@ -30,7 +30,8 @@ if GCP_CREDENTIALS and GCP_PROJECT and GCP_REGION: # EMBEDDING_FUNCTION = VertexAIEmbeddings(model_name="text-embedding-004")#OpenAIEmbeddings(model="text-embedding-3-large") # EMBEDDING_FUNCTION = OpenAIEmbeddings(model="text-embedding-3-large") -EMBEDDING_FUNCTION = OllamaEmbeddings(model="llama3.2") +# EMBEDDING_FUNCTION = OllamaEmbeddings(model="llama3.2") +EMBEDDING_FUNCTION = OllamaEmbeddings(model="mxbai-embed-large") def make_db_context_vec_db(db_directory_path: str, **kwargs) -> None: diff --git a/CHESS/src/llm/engine_configs.py b/CHESS/src/llm/engine_configs.py index 4d2c6b8..78e03b9 100644 --- a/CHESS/src/llm/engine_configs.py +++ b/CHESS/src/llm/engine_configs.py @@ -98,26 +98,26 @@ ENGINE_CONFIGS: Dict[str, Dict[str, Any]] = { "constructor": ChatAnthropic, "params": {"model": "claude-3-opus-20240229", "temperature": 0} }, - # "finetuned_nl2sql": { - # "constructor": ChatOpenAI, - # "params": { - # "model": "AI4DS/NL2SQL_DeepSeek_33B", - # "openai_api_key": "EMPTY", - # "openai_api_base": "/v1", - # "max_tokens": 400, - # "temperature": 0, - # "stop": ["```\n", ";"] - # } - # }, "finetuned_nl2sql": { "constructor": ChatOpenAI, "params": { - "model": "ft:gpt-4o-mini-2024-07-18:stanford-university::9p4f6Z4W", + "model": "AI4DS/NL2SQL_DeepSeek_33B", + "openai_api_key": "EMPTY", + "openai_api_base": "/v1", "max_tokens": 400, "temperature": 0, "stop": ["```\n", ";"] } }, + # "finetuned_nl2sql": { + # "constructor": ChatOpenAI, + # "params": { + # "model": "ft:gpt-4o-mini-2024-07-18:stanford-university::9p4f6Z4W", + # "max_tokens": 400, + # "temperature": 0, + # "stop": ["```\n", ";"] + # } + # }, "column_selection_finetuning": { "constructor": ChatOpenAI, "params": { @@ -166,9 +166,23 @@ ENGINE_CONFIGS: Dict[str, Dict[str, Any]] = { "model": "llama3.2", "temperature": 0, "model_kwargs": { - "stop": [""], + "stop": ["<|eot_id|>"], }, "num_ctx": 32768 } } + # "meta-llama/llama3-2": { + # "constructor": ChatOpenAI, + # "params": { + # "model": "llama3.2", + # "openai_api_key": "EMPTY", + # "openai_api_base": "http://localhost:11434/v1", + # "max_tokens": 600, + # "temperature": 0 #, + # # "model_kwargs": { + # # "stop": [""] + # # } + # + # } + # } } diff --git a/CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py b/CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py index 8581537..d304eb4 100644 --- a/CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py +++ b/CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py @@ -33,7 +33,8 @@ class RetrieveEntity(Tool): def __init__(self): super().__init__() # self.embedding_function = OpenAIEmbeddings(model="text-embedding-3-small") - self.embedding_function = OllamaEmbeddings(model="llama3.2") + # self.embedding_function = OllamaEmbeddings(model="llama3.2") + self.embedding_function = OllamaEmbeddings(model="nomic-embed-text") self.edit_distance_threshold = 0.3 self.embedding_similarity_threshold = 0.6 diff --git a/scripts/batch.sh b/scripts/batch.sh index 9519fdd..ac1421e 100644 --- a/scripts/batch.sh +++ b/scripts/batch.sh @@ -6,17 +6,24 @@ #SBATCH --mem=4G # reserved RAM #SBATCH --cpus-per-task=1 # number of CPU cores per task, alternative: --cpus-per-gpu #SBATCH --gres=gpu:0 # number of GPUs -#SBATCH --time=00:15:00 # Time limit for the entire job +#SBATCH --time=01:00:00 # Time limit for the entire job KUERZEL=$1 REPOPATH=$2 -RESULTPATH=$3 +MODELSPATH=$3 +RESULTPATH=$4 # Check argurment number -if [ "$#" -ne 3 ]; then +if [ "$#" -ne 4 ]; then echo "Found incorrect number of arguments ($#)." - echo "Please specify a user shorthand, a re-CHESS repo zip file path and a results directory path:" - printf '\tbatch.sh <user-shorthand> <repo-path> <result-dir-path>\n' + echo "Please specify:" + echo " - a user shorthand" + echo " - a re-CHESS repo zip file path (to copy from)" + echo " - a Ollama models zip file path (to copy from)" + #echo " - a BIRD dataset zip file path (to copy from)" + #echo " - a Spider dataset zip file path (to copy from)" + echo " - a results directory path (to copy to)" + printf '\tbatch.sh <user-shorthand> <repo-path> <models-path> <result-dir-path>\n' exit 1 fi @@ -28,8 +35,16 @@ if [ ! -f $2 ]; then exit 1 fi +# Check if the second argument is an existing file +if [ ! -f $3 ]; then + echo "File $3 does not exist or is not a file." + echo "Please specify a user shorthand, a re-CHESS repo zip file path and a results directory path:" + printf '\tbatch.sh <user-shorthand> <repo-path>\n' + exit 1 +fi + # Check if the third argument is an existing directory -if [ ! -d $3 ]; then +if [ ! -d $4 ]; then echo "Directory $3 does not exist or is not a directory." echo "Please specify a user shorthand, a re-CHESS repo zip file path and a results directory path:" printf '\tbatch.sh <user-shorthand> <repo-path>\n' @@ -37,7 +52,7 @@ if [ ! -d $3 ]; then fi # Create user directory on /zpool1/slurm_data/ if it doesn't exist yet -srun if [ ! -d /zpool1/slurm_data/${KUERZEL} ]; then mkdir /zpool1/slurm_data/${KUERZEL}; fi +srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then srun rm -r /zpool1/slurm_data/${KUERZEL}/*; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) # Create directory for the job srun mkdir /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID} @@ -47,26 +62,29 @@ RECHESSROOT=/zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID}/re-chess # Copy re-CHESS repo to the jobs folder and unpack sbcast $2 /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID}/re-chess.zip -srun unzip /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID}/re-chess.zip -d ${RECHESSROOT} +srun unzip /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID}/re-chess.zip -d /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID} # Download and unpack Ollama -srun curl -L https://ollama.com/download/ollama-linux-amd64.tgz -O ${RECHESSROOT}/ollama/ollama-linux-amd64.tgz +srun curl -f -s -S -L 'https://ollama.com/download/ollama-linux-amd64.tgz' -o "${RECHESSROOT}/ollama/ollama-linux-amd64.tgz" srun tar -xzvf ${RECHESSROOT}/ollama/ollama-linux-amd64.tgz -C ${RECHESSROOT}/ollama OLPATH=${RECHESSROOT}/ollama OLLOGFILE=/dev/null -OLLAMA_MODELS=${OLPATH}/model +export OLLAMA_MODELS=${OLPATH}/models +export OLLAMA_KEEP_ALIVE=-1 -# Pull llama3:70b -srun ${OLPATH}/./bin/ollama serve & sleep 10 ; ${OLPATH}/./bin/ollama pull llama3:70b ; kill %1 +# Pull/copy llama3:70b +# srun ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'llama3:70b' ; kill %1 +sbcast $3 ${OLPATH}/models.zip +srun unzip ${OLPATH}/models.zip -d ${OLPATH} # Download the BIRD & Spider dataset BIRDPATH=$RECHESSROOT/CHESS/data/BIRD SPIDERPATH=$RECHESSROOT/CHESS/data/Spider -srun if [ ! -d ${BIRDPATH} ]; then mkdir ${BIRDPATH}; fi -srun if [ ! -d ${SPIDERPATH} ]; then mkdir ${SPIDERPATH}; fi -srun curl -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip' -o ${BIRDPATH}/dev.zip -srun curl -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/train.zip' -o ${BIRDPATH}/train.zip -srun curl -L 'https://drive.usercontent.google.com/download?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&confirm=t' -o ${SPIDERPATH}/spider_data.zip +srun $(if [ ! -d ${BIRDPATH} ]; then mkdir ${BIRDPATH}; fi) +srun $(if [ ! -d ${SPIDERPATH} ]; then mkdir ${SPIDERPATH}; fi) +srun curl -f -s -S -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip' -o ${BIRDPATH}/dev.zip +srun curl -f -s -S -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/train.zip' -o ${BIRDPATH}/train.zip +srun curl -f -s -S -L 'https://drive.usercontent.google.com/download?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&confirm=t' -o ${SPIDERPATH}/spider_data.zip # Unpack BIRD & Spider datasets srun unzip ${BIRDPATH}/dev.zip -d ${BIRDPATH} @@ -105,4 +123,4 @@ srun zip ${RESULTPATH}/results.zip ${RECHESSROOT}/CHESS/results/* ##srun ls -la ${OLPATH}/model # Remove all the data of the job to free memory on the node -srun rm -r /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID} +srun rm -r /zpool1/slurm_data/${KUERZEL}/* diff --git a/scripts/cleaner.sh b/scripts/cleaner.sh new file mode 100644 index 0000000..a190294 --- /dev/null +++ b/scripts/cleaner.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH -e slurm-%j.err # error file-path +#SBATCH -o slurm-%j.out # output file-path +#SBATCH --nodes=1 # number of work nodes +#SBATCH --ntasks=1 # number of tasks +#SBATCH --mem=4G # reserved RAM +#SBATCH --cpus-per-task=1 # number of CPU cores per task, alternative: --cpus-per-gpu +#SBATCH --gres=gpu:0 # number of GPUs +#SBATCH --time=00:15:00 # Time limit for the entire job + +KUERZEL=$1 + +# Check argurment number +if [ "$#" -ne 1 ]; then + echo "Found incorrect number of arguments ($#)." + echo "Please specify a user shorthand:" + printf '\tcleaner.sh <user-shorthand>\n' + exit 1 +fi + + +# Remove previous stuff because script failed. +srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then rm -r /zpool1/slurm_data/${KUERZEL}/*; else echo "Nothing to remove."; fi) + +srun echo "Done." diff --git a/scripts/copyrepo.sh b/scripts/copyrepo.sh new file mode 100644 index 0000000..f0c1944 --- /dev/null +++ b/scripts/copyrepo.sh @@ -0,0 +1,118 @@ +#!/bin/bash +#SBATCH -e slurm-%j.err # error file-path +#SBATCH -o slurm-%j.out # output file-path +#SBATCH --nodes=1 # number of work nodes +#SBATCH --ntasks=1 # number of tasks +#SBATCH --mem=8G # reserved RAM +#SBATCH --cpus-per-task=2 # number of CPU cores per task, alternative: --cpus-per-gpu +#SBATCH --gres=gpu:0 # number of GPUs +#SBATCH --time=01:00:00 # Time limit for the entire job + +KUERZEL=$1 +REPOPATH=$2 + +# Check argurment number +if [ "$#" -ne 2 ]; then + echo "Found incorrect number of arguments ($#)." + echo "Please specify:" + echo " - a user shorthand" + echo " - a re-CHESS repo zip file path (to copy from)" + printf '\tcopyrepo.sh <user-shorthand> <repo-path>\n' + exit 1 +fi + +# Check if the second argument is an existing file +if [ ! -f $2 ]; then + echo "File $2 does not exist or is not a file." + echo "Please specify a user shorthand and the re-CHESS repo zip file path:" + printf '\tbatch.sh <user-shorthand> <repo-path>\n' + exit 1 +fi + +# Create user directory on /zpool1/slurm_data/ if it doesn't exist yet +srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then echo 'not doing rm -r /zpool1/slurm_data/${KUERZEL}/*'; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) +#srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then rm -r /zpool1/slurm_data/${KUERZEL}/*; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) + +# Create directory for the job +srun mkdir /zpool1/slurm_data/${KUERZEL}/re-chess + +RECHESSROOT=/zpool1/slurm_data/${KUERZEL}/re-chess + +# Copy re-CHESS repo to the jobs folder and unpack +sbcast $2 /zpool1/slurm_data/${KUERZEL}/re-chess.zip +srun unzip /zpool1/slurm_data/${KUERZEL}/re-chess.zip -d /zpool1/slurm_data/${KUERZEL} + +# Download and unpack Ollama +srun ls -Rla /zpool1/slurm_data/${KUERZEL} +srun mkdir /zpool1/slurm_data/${KUERZEL}/ollama +srun curl -f -s -S -L 'https://ollama.com/download/ollama-linux-amd64.tgz' -o "${RECHESSROOT}/ollama/ollama-linux-amd64.tgz" +srun tar -xzvf ${RECHESSROOT}/ollama/ollama-linux-amd64.tgz -C ${RECHESSROOT}/ollama +OLPATH=${RECHESSROOT}/ollama +OLSLOGFILE=/dev/null # log file for ollama serve (all occurences) +OLPLOGFILE=/dev/null # log file for ollama pull (all occurences) +export OLLAMA_MODELS=${OLPATH}/models + +# Pull/copy llama3:70b +srun ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'llama3:70b' ; kill %1 ; echo "Llama3-70b pulled" +srun ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'mxbai-embed-large' ; kill %1 ; echo "mxbai-embed-large pulled" +srun ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'nomic-embed-text' ; kill %1 ; echo "nomic-embed-text pulled" +# sbcast $3 ${OLPATH}/models.zip +# srun unzip ${OLPATH}/models.zip -d ${OLPATH} + +# Download the BIRD & Spider dataset +BIRDPATH=${RECHESSROOT}/CHESS/data/BIRD +SPIDERPATH=${RECHESSROOT}/CHESS/data/Spider +srun ls -Rla /zpool1/slurm_data/${KUERZEL} +srun $(if [ ! -d ${BIRDPATH} ]; then mkdir ${BIRDPATH}; else echo "Directory ${BIRDPATH} exists."; fi) +srun $(if [ ! -d ${SPIDERPATH} ]; then mkdir ${SPIDERPATH}; else echo "Directory ${SPIDERPATH} exists."; fi) +srun curl -f -s -S -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip' -o "${BIRDPATH}/dev.zip" +srun curl -f -s -S -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/train.zip' -o "${BIRDPATH}/train.zip" +srun curl -f -s -S -L 'https://drive.usercontent.google.com/download?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&confirm=t' -o "${SPIDERPATH}/spider_data.zip" + +# Unpack BIRD & Spider datasets +srun unzip ${BIRDPATH}/dev.zip -d ${BIRDPATH} +#srun unzip ${BIRDPATH}/train.zip -d ${BIRDPATH} +srun unzip ${SPIDERPATH}/spider_data.zip -d ${SPIDERPATH} + +# Unpack BIRD specific sub-directory +srun mv ${BIRDPATH}/dev_20240627 ${BIRDPATH}/dev +#srun rm ${BIRDPATH}/dev_20240627 +srun unzip ${BIRDPATH}/dev/dev_databases.zip -d ${BIRDPATH}/dev +#srun unzip ${BIRDPATH}/train/train_databases.zip -d ${BIRDPATH}/train + +# Create a Python venv +#srun py3versions --default +#srun py3versions --installed +PYTHONVER=$(py3versions --default) +srun ${PYTHONVER} -m venv ${RECHESSROOT}/CHESS/venv +VENVPATH=${RECHESSROOT}/CHESS/venv +OLDERPWD=${PWD} +source "${VENVPATH}/bin/activate" ; +srun which pip3.11 ; +deactivate +srun which pip3.11 ; +#srun cd ${RECHESSROOT}/CHESS ; source "${VENVPATH}/bin/activate" ; which pip3.11 ; deactivate ; cd ${OLDERPWD} +srun cd ${RECHESSROOT}/CHESS ; source "${VENVPATH}/bin/activate" ; pip3.11 install -r "${RECHESSROOT}/CHESS/requirements.txt" ; deactivate ; cd ${OLDERPWD} + +# copy the .env.rechess file to .env +srun cp ${RECHESSROOT}/CHESS/.env.rechess ${RECHESSROOT}/CHESS/.env + +# run preprocessing script +# srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10; /run/./run-preprocess.sh ; kill %1; deactivate; cd $OLDPWD +srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10; echo "\"Running preprocessing...\"" ; kill %1; deactivate; cd $OLDPWD + +# run the agent script +srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10; /run/./run_main_ir_ss_cg_rechess.sh ; kill %1; deactivate; cd $OLDPWD + +# gather results +srun zip ${RESULTPATH}/results.zip ${RECHESSROOT}/CHESS/results/* + +##srun /zpool1/ +##srun ls -la /lib +##srun ls -la /bin +##srun ls -la /zpool1/slurm_data/${KUERZEL}/${SLURM_JOBID} +##srun ls -la ${OLPATH} +##srun ls -la ${OLPATH}/model + +# Remove all the data of the job to free memory on the node +#srun rm -r /zpool1/slurm_data/${KUERZEL}/* -- GitLab From 2190c706fd4f3793f2cb11e4bf18fe7af8d3fbda Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Fri, 21 Feb 2025 15:05:22 +0100 Subject: [PATCH 04/12] Gotten closer to running it on McG, vllm nl2sql model uses too much memory. - added slurm log files to .gitignore - renamed env file, setup to run with bird/sds - added vllm and its outlines dependency to requirements as mentioned in model_deployment.txt - renamed run script to bird-oss (open source software, refers to open source models) - changed cleanerscript to produce output - changed copyrepo.sh to allow for parallel execution of models and CHESS code - moved parallel sections of copyrepo to separate scripts --- .gitignore | 2 + CHESS/.env-rechess-bird-sds | 21 ++++ CHESS/requirements.txt | 2 + CHESS/run/run_main_ir_ss_cg_bird-oss.sh | 11 ++ CHESS/run/run_main_ir_ss_cg_rechess.sh | 3 - scripts/cleaner.sh | 4 +- scripts/copyrepo.sh | 132 ++++++++++++++++-------- scripts/ollamapull.sh | 14 +++ scripts/runchess.sh | 41 ++++++++ 9 files changed, 183 insertions(+), 47 deletions(-) create mode 100644 CHESS/.env-rechess-bird-sds create mode 100755 CHESS/run/run_main_ir_ss_cg_bird-oss.sh delete mode 100755 CHESS/run/run_main_ir_ss_cg_rechess.sh create mode 100755 scripts/ollamapull.sh create mode 100755 scripts/runchess.sh diff --git a/.gitignore b/.gitignore index 14fa2b9..bb45c1b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ ollama/* !ollama/models !ollama/models/.gitignore CHESS/.env.ollama3.2 +slurm*.err +slurm*.out diff --git a/CHESS/.env-rechess-bird-sds b/CHESS/.env-rechess-bird-sds new file mode 100644 index 0000000..9400a7f --- /dev/null +++ b/CHESS/.env-rechess-bird-sds @@ -0,0 +1,21 @@ +OPENAI_API_KEY="OPEN AI API KEY" + +DB_ROOT_PATH="./data/BIRD/dev" # this directory should be the parent of test_databases + +DATA_MODE="dev" +DATA_PATH="./data/BIRD/dev/sub_sampled_bird_dev_set.json" +DB_ROOT_DIRECTORY="./data/BIRD/dev/dev_databases" +DATA_TABLES_PATH="./data/BIRD/dev/dev_tables.json" +INDEX_SERVER_HOST='localhost' +INDEX_SERVER_PORT=12345 + +OPENAI_API_KEY='EMPTY' +GCP_PROJECT='' +GCP_REGION='us-central1' +GCP_CREDENTIALS='' +GOOGLE_CLOUD_PROJECT='' + +# PATH="$PATH:$PWD/ollama/bin" +# OLLAMA_HOST="127.0.0.1:11434" +# OLLAMA_MODELS="~/.ollama/models" + diff --git a/CHESS/requirements.txt b/CHESS/requirements.txt index 528c65e..456a230 100644 --- a/CHESS/requirements.txt +++ b/CHESS/requirements.txt @@ -27,3 +27,5 @@ filelock==3.15.4 faiss-cpu==1.8.0 datasets==2.21.0 pyyaml==6.0.2 +vllm==0.3.3 +outlines==0.0.33 diff --git a/CHESS/run/run_main_ir_ss_cg_bird-oss.sh b/CHESS/run/run_main_ir_ss_cg_bird-oss.sh new file mode 100755 index 0000000..e75c31b --- /dev/null +++ b/CHESS/run/run_main_ir_ss_cg_bird-oss.sh @@ -0,0 +1,11 @@ +source .env +data_mode=$DATA_MODE # Options: 'dev', 'train' +data_path=$DATA_PATH # UPDATE THIS WITH THE PATH TO THE TARGET DATASET + +config="./run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml" + +num_workers=1 # Number of workers to use for parallel processing, set to 1 for no parallel processing + +python3 -u ./src/main.py --data_mode ${data_mode} --data_path ${data_path} --config "$config" \ + --num_workers ${num_workers} --pick_final_sql true + diff --git a/CHESS/run/run_main_ir_ss_cg_rechess.sh b/CHESS/run/run_main_ir_ss_cg_rechess.sh deleted file mode 100755 index 9fda2c5..0000000 --- a/CHESS/run/run_main_ir_ss_cg_rechess.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -echo "Running run_main_ir_ss_cg_rechess.sh..." diff --git a/scripts/cleaner.sh b/scripts/cleaner.sh index a190294..f46b3f4 100644 --- a/scripts/cleaner.sh +++ b/scripts/cleaner.sh @@ -20,6 +20,8 @@ fi # Remove previous stuff because script failed. -srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then rm -r /zpool1/slurm_data/${KUERZEL}/*; else echo "Nothing to remove."; fi) +#srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then rm -r /zpool1/slurm_data/${KUERZEL}/*; else echo "Nothing to remove."; fi) +srun rm -r /zpool1/slurm_data/${KUERZEL}/* +srun ls -la /zpool1/slurm_data/${KUERZEL}/* srun echo "Done." diff --git a/scripts/copyrepo.sh b/scripts/copyrepo.sh index f0c1944..ce44237 100644 --- a/scripts/copyrepo.sh +++ b/scripts/copyrepo.sh @@ -1,13 +1,18 @@ #!/bin/bash +#SBATCH --job-name rechess # name of the job #SBATCH -e slurm-%j.err # error file-path #SBATCH -o slurm-%j.out # output file-path #SBATCH --nodes=1 # number of work nodes -#SBATCH --ntasks=1 # number of tasks -#SBATCH --mem=8G # reserved RAM -#SBATCH --cpus-per-task=2 # number of CPU cores per task, alternative: --cpus-per-gpu -#SBATCH --gres=gpu:0 # number of GPUs +#SBATCH --ntasks=3 # number of tasks, we need to specify --ntasks=1 for use of | and & in bash +#SBATCH --mem=128G # reserved RAM +#SBATCH --cpus-per-task=4 # number of CPU cores per task +#SBATCH --gres=gpu:2 # number of GPUs #SBATCH --time=01:00:00 # Time limit for the entire job +## NOTE: --cpus-per-task is logical cores rather than physical (Multithreading)! +## Add --cpus-per-task=<even-no> or ${SLURM_} to each parallel task (preferred) +## Or disable multithreading: ##SBATCH --hint=nomultithread + KUERZEL=$1 REPOPATH=$2 @@ -30,82 +35,123 @@ if [ ! -f $2 ]; then fi # Create user directory on /zpool1/slurm_data/ if it doesn't exist yet -srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then echo 'not doing rm -r /zpool1/slurm_data/${KUERZEL}/*'; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) -#srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then rm -r /zpool1/slurm_data/${KUERZEL}/*; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) +#srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then echo 'not doing rm -r /zpool1/slurm_data/${KUERZEL}/*'; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) +srun --ntasks=1 $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then rm -r /zpool1/slurm_data/${KUERZEL}/*; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) # Create directory for the job -srun mkdir /zpool1/slurm_data/${KUERZEL}/re-chess +srun --ntasks=1 mkdir /zpool1/slurm_data/${KUERZEL}/re-chess RECHESSROOT=/zpool1/slurm_data/${KUERZEL}/re-chess # Copy re-CHESS repo to the jobs folder and unpack sbcast $2 /zpool1/slurm_data/${KUERZEL}/re-chess.zip -srun unzip /zpool1/slurm_data/${KUERZEL}/re-chess.zip -d /zpool1/slurm_data/${KUERZEL} +srun --ntasks=1 unzip /zpool1/slurm_data/${KUERZEL}/re-chess.zip -d /zpool1/slurm_data/${KUERZEL} # Download and unpack Ollama -srun ls -Rla /zpool1/slurm_data/${KUERZEL} -srun mkdir /zpool1/slurm_data/${KUERZEL}/ollama -srun curl -f -s -S -L 'https://ollama.com/download/ollama-linux-amd64.tgz' -o "${RECHESSROOT}/ollama/ollama-linux-amd64.tgz" -srun tar -xzvf ${RECHESSROOT}/ollama/ollama-linux-amd64.tgz -C ${RECHESSROOT}/ollama +srun --ntasks=1 ls -la /zpool1/slurm_data/${KUERZEL} +srun --ntasks=1 mkdir /zpool1/slurm_data/${KUERZEL}/ollama +srun --ntasks=1 mkdir /zpool1/slurm_data/${KUERZEL}/vllm +srun --ntasks=1 curl -f -s -S -L 'https://ollama.com/download/ollama-linux-amd64.tgz' -o "${RECHESSROOT}/ollama/ollama-linux-amd64.tgz" +srun --ntasks=1 tar -xzvf ${RECHESSROOT}/ollama/ollama-linux-amd64.tgz -C ${RECHESSROOT}/ollama OLPATH=${RECHESSROOT}/ollama OLSLOGFILE=/dev/null # log file for ollama serve (all occurences) OLPLOGFILE=/dev/null # log file for ollama pull (all occurences) export OLLAMA_MODELS=${OLPATH}/models +export OLLAMA_KEEP_ALIVE=-1 # Pull/copy llama3:70b -srun ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'llama3:70b' ; kill %1 ; echo "Llama3-70b pulled" -srun ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'mxbai-embed-large' ; kill %1 ; echo "mxbai-embed-large pulled" -srun ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'nomic-embed-text' ; kill %1 ; echo "nomic-embed-text pulled" +srun --ntasks=1 --time='00:06:00' ollamapull.sh ${OLPATH} +#srun --exclusive --time='00:06:00' --ntasks=1 --nodes=${SLURM_NNODES} --cpus-per-task=${SLURM_CPUS_PER_TASK} ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & +#TASKPID=$! +# sleep 10 +#srun --ntasks=4 sleep 10 & # this seems incredibly stupid, but it ain't stupid if it work, right? +#srun --ntasks=1 --nodes=${SLURM_NNODES} --cpus-per-task=${SLURM_CPUS_PER_TASK} sleep 10 +#wait %2 +# srun --ntasks=1 sleep 10 ; echo 'llama3:70b' 'mxbai-embed-large' 'nomic-embed-text' | xargs -n1 ${OLPATH}/./bin/ollama pull & +#srun --ntasks=1 --nodes=${SLURM_NNODES} --cpus-per-task=${SLURM_CPUS_PER_TASK} ${OLPATH}/./bin/ollama pull 'llama3:70b' & +#srun --ntasks=1 --nodes=${SLURM_NNODES} --cpus-per-task=${SLURM_CPUS_PER_TASK} ${OLPATH}/./bin/ollama pull 'mxbai-embed-large' & +#srun --ntasks=1 --nodes=${SLURM_NNODES} --cpus-per-task=${SLURM_CPUS_PER_TASK} ${OLPATH}/./bin/ollama pull 'nomic-embed-text' & +#wait %2 %3 %4 +#kill ${TASKPID} +# --dependency=after:1+1 ## resolution is 1 minute rounded up, job 1 was probably started at some point +# --time='00:06:00' ## download takes about 4.5 mins, increase to this with waiting +#srun ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'mxbai-embed-large' ; kill %1 ; echo "mxbai-embed-large pulled" +#srun ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10 ; ${OLPATH}/./bin/ollama pull 'nomic-embed-text' ; kill %1 ; echo "nomic-embed-text pulled" # sbcast $3 ${OLPATH}/models.zip # srun unzip ${OLPATH}/models.zip -d ${OLPATH} # Download the BIRD & Spider dataset BIRDPATH=${RECHESSROOT}/CHESS/data/BIRD SPIDERPATH=${RECHESSROOT}/CHESS/data/Spider -srun ls -Rla /zpool1/slurm_data/${KUERZEL} -srun $(if [ ! -d ${BIRDPATH} ]; then mkdir ${BIRDPATH}; else echo "Directory ${BIRDPATH} exists."; fi) -srun $(if [ ! -d ${SPIDERPATH} ]; then mkdir ${SPIDERPATH}; else echo "Directory ${SPIDERPATH} exists."; fi) -srun curl -f -s -S -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip' -o "${BIRDPATH}/dev.zip" -srun curl -f -s -S -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/train.zip' -o "${BIRDPATH}/train.zip" -srun curl -f -s -S -L 'https://drive.usercontent.google.com/download?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&confirm=t' -o "${SPIDERPATH}/spider_data.zip" +SPIDERTWOPATH=${RECHESSROOT}/CHESS/data/Spider2 +srun --ntasks=1 ls -la /zpool1/slurm_data/${KUERZEL} +srun --ntasks=1 $(if [ ! -d ${BIRDPATH} ]; then mkdir ${BIRDPATH}; else echo "Directory ${BIRDPATH} exists."; fi) +srun --ntasks=1 $(if [ ! -d ${SPIDERPATH} ]; then mkdir ${SPIDERPATH}; else echo "Directory ${SPIDERPATH} exists."; fi) +# srun --ntasks=1 $(if [ ! -d ${SPIDERTWOPATH} ]; then mkdir ${SPIDERPATH}; else echo "Directory ${SPIDERPATH} exists."; fi) +srun --ntasks=1 curl -f -s -S -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip' -o "${BIRDPATH}/dev.zip" +#srun --ntasks=1 curl -f -s -S -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/train.zip' -o "${BIRDPATH}/train.zip" +srun --ntasks=1 curl -f -s -S -L 'https://drive.usercontent.google.com/download?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&confirm=t' -o "${SPIDERPATH}/spider_data.zip" +#srun --ntasks=1 curl -f -s -S -L 'https://drive.usercontent.google.com/download?id=1coEVsCZq-Xvj9p2TnhBFoFTsY-UoYGmG&confirm=t' -o '${SPIDERTWOPATH}/local_sqlite.zip' # for spider 2.0 # Unpack BIRD & Spider datasets -srun unzip ${BIRDPATH}/dev.zip -d ${BIRDPATH} -#srun unzip ${BIRDPATH}/train.zip -d ${BIRDPATH} -srun unzip ${SPIDERPATH}/spider_data.zip -d ${SPIDERPATH} +srun --ntasks=1 unzip ${BIRDPATH}/dev.zip -d ${BIRDPATH} +#srun --ntasks=1 unzip ${BIRDPATH}/train.zip -d ${BIRDPATH} +srun --ntasks=1 unzip ${SPIDERPATH}/spider_data.zip -d ${SPIDERPATH} # Unpack BIRD specific sub-directory -srun mv ${BIRDPATH}/dev_20240627 ${BIRDPATH}/dev -#srun rm ${BIRDPATH}/dev_20240627 -srun unzip ${BIRDPATH}/dev/dev_databases.zip -d ${BIRDPATH}/dev -#srun unzip ${BIRDPATH}/train/train_databases.zip -d ${BIRDPATH}/train - -# Create a Python venv -#srun py3versions --default -#srun py3versions --installed +srun --ntasks=1 mv ${BIRDPATH}/dev_20240627 ${BIRDPATH}/dev +#srun --ntasks=1 rm ${BIRDPATH}/dev_20240627 +srun --ntasks=1 unzip ${BIRDPATH}/dev/dev_databases.zip -d ${BIRDPATH}/dev +#srun --ntasks=1 unzip ${BIRDPATH}/train/train_databases.zip -d ${BIRDPATH}/train +srun --ntasks=1 cp "${RECHESSROOT}/CHESS/data/dev/sub_sampled_bird_dev_set.json" "${BIRDPATH}/dev/sub_sampled_bird_dev_set.json" + +# Unpack Spider 2.0 specific sub-directory +# TODO, dataset spread out over download above and github repo... + +# Create a Python venv and install requirements +#srun --ntasks=1 py3versions --default +#srun --ntasks=1 py3versions --installed PYTHONVER=$(py3versions --default) -srun ${PYTHONVER} -m venv ${RECHESSROOT}/CHESS/venv +srun --ntasks=1 ${PYTHONVER} -m venv ${RECHESSROOT}/CHESS/venv VENVPATH=${RECHESSROOT}/CHESS/venv OLDERPWD=${PWD} source "${VENVPATH}/bin/activate" ; -srun which pip3.11 ; +# srun which pip3.11 ; # for debugging purposes +srun --ntasks=1 pip3.11 install -r "${RECHESSROOT}/CHESS/requirements.txt" + +# Load nl2sql model via vllm +srun --time='00:05:00' --ntasks=1 ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --dtype float32 --download-dir "${RECHESSROOT}/vllm" +srun --ntasks=1 ls -la "${VENVPATH}/lib/${PYTHONVER}/site-packages" deactivate -srun which pip3.11 ; +# srun which pip3.11 ; # for debugging purposes #srun cd ${RECHESSROOT}/CHESS ; source "${VENVPATH}/bin/activate" ; which pip3.11 ; deactivate ; cd ${OLDERPWD} -srun cd ${RECHESSROOT}/CHESS ; source "${VENVPATH}/bin/activate" ; pip3.11 install -r "${RECHESSROOT}/CHESS/requirements.txt" ; deactivate ; cd ${OLDERPWD} +#srun cd ${RECHESSROOT}/CHESS ; source "${VENVPATH}/bin/activate" ; pip3.11 install -r "${RECHESSROOT}/CHESS/requirements.txt" ; deactivate ; cd ${OLDERPWD} + # copy the .env.rechess file to .env -srun cp ${RECHESSROOT}/CHESS/.env.rechess ${RECHESSROOT}/CHESS/.env +srun --ntasks=1 cp ${RECHESSROOT}/CHESS/.env-rechess-bird-sds ${RECHESSROOT}/CHESS/.env +#srun cp ${RECHESSROOT}/CHESS/.env-rechess-spider ${RECHESSROOT}/CHESS/.env # run preprocessing script # srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10; /run/./run-preprocess.sh ; kill %1; deactivate; cd $OLDPWD -srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10; echo "\"Running preprocessing...\"" ; kill %1; deactivate; cd $OLDPWD - +srun --ntasks=1 --time='00:15:00' runchess.sh ${RECHESSROOT} ${PYTHONVER} + +#source ${VENVPATH}/bin/activate +#cd ${RECHESSROOT}/CHESS +#srun --time='00:15:00' --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & +#srun --time='00:15:00' --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "meta-llama/Meta-Llama-3-70B-Instruct" --dtype float32 --download-dir "${RECHESSROOT}/vllm" & +#srun --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} sleep 10 & +#wait %3 +#srun --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} run/./run_preprocess.sh & # run the agent script -srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLLOGFILE} & sleep 10; /run/./run_main_ir_ss_cg_rechess.sh ; kill %1; deactivate; cd $OLDPWD +#wait %3 +#srun --time='00:02:00' --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} run/run_main_ir_ss_cg_bird-oss.sh & +#wait +#deactivate +#cd $OLDERPWD -# gather results -srun zip ${RESULTPATH}/results.zip ${RECHESSROOT}/CHESS/results/* +# gather results, potentially breaks because of filenames starting with '-' ? +srun --ntasks=1 zip -r ${PWD}/results.zip ${RECHESSROOT}/CHESS/results/* ##srun /zpool1/ ##srun ls -la /lib @@ -115,4 +161,4 @@ srun zip ${RESULTPATH}/results.zip ${RECHESSROOT}/CHESS/results/* ##srun ls -la ${OLPATH}/model # Remove all the data of the job to free memory on the node -#srun rm -r /zpool1/slurm_data/${KUERZEL}/* +srun --ntasks=1 rm -r /zpool1/slurm_data/${KUERZEL}/* diff --git a/scripts/ollamapull.sh b/scripts/ollamapull.sh new file mode 100755 index 0000000..03f0956 --- /dev/null +++ b/scripts/ollamapull.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +OLPATH=$1 +OLLOGPULL='/dev/null' +OLLOGSERVE='/dev/null' + +${OLPATH}/bin/ollama serve &> ${OLLOGSERVE} & + +sleep 10 +${OLPATH}/bin/ollama pull 'llama3:70b' &> ${OLLOGPULL} +${OLPATH}/bin/ollama pull 'mxbai-embed-large' &> ${OLLOGPULL} +${OLPATH}/bin/ollama pull 'nomic-embed-text' &> ${OLLOGPULL} + +kill %1 diff --git a/scripts/runchess.sh b/scripts/runchess.sh new file mode 100755 index 0000000..69c13c1 --- /dev/null +++ b/scripts/runchess.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +RECHESSROOT=$1 +PYTHONVER=$2 +# OLLAMA_MODELS=${RECHESSROOT}/ollama/models +OLDERPWD=${PWD} +OLLOGSERVE="/dev/null" + + +source ${RECHESSROOT}/CHESS/venv/bin/activate +cd ${RECHESSROOT}/CHESS + +nvidia-smi -L +gpulist=$(nvidia-smi -L | cut -c 5-5 | tr '\n' ' ') +gpuzero=$(echo $gpulist | cut -c 1-1) +gpuone=$(echo $gpulist | cut -c 3-3) +echo $gpulist ' - ' $gpuzero ' - ' $gpuone + +CUDA_VISIBLE_DEVICES=$gpuone +${RECHESSROOT}/ollama/bin/ollama serve & + +date; +sleep 30; +date; +CUDA_VISIBLE_DEVICES=$gpuzero +${PYTHONVER} -m vllm.entrypoints.openai.api_server --gpu-memory-utilization 0.8 --model "AI4DS/NL2SQL_DeepSeek_33B" --dtype float32 --download-dir "${RECHESSROOT}/vllm" & + +date; +sleep 30; +date; + +run/run_preprocess.sh + +# Select the predefined script +run/run_main_ir_ss_cg_bird-oss.sh +# run/run_main_ir_ss_cg_spider-oss.sh +# run/run_main_ir_ss_cg_spider2-oss.sh + +kill %1 %2 +cd ${OLDERPWD} +deactivate -- GitLab From 89868cdcb87e8df61ca739d09c59966ded4cca93 Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Mon, 3 Mar 2025 12:36:31 +0100 Subject: [PATCH 05/12] Fixed some configs, run script things --- .../run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml | 14 ++++--- CHESS/src/llm/engine_configs.py | 37 +++++++++++-------- scripts/copyrepo.sh | 34 +++++++++++------ scripts/runchess.sh | 4 +- 4 files changed, 54 insertions(+), 35 deletions(-) diff --git a/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml b/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml index 81eb44a..7d045b2 100644 --- a/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml +++ b/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml @@ -41,20 +41,22 @@ team_agents: parser_name: 'select_columns' candidate_generator: - engine: 'finetuned_nl2sql' + engine: 'meta-llama/Meta-Llama-3-70B-Instruct' tools: generate_candidate: generator_configs: - - template_name: 'generate_candidate_one' + - template_name: 'generate_candidate_finetuned' + # - template_name: 'generate_candidate_one' engine_config: engine_name: 'finetuned_nl2sql' temperature: 0.01 - parser_name: 'generate_candidate_gemini_markdown_cot' + #parser_name: 'generate_candidate_gemini_markdown_cot' + parser_name: 'generated_candidate_finetuned' sampling_count: 1 revise: - template_name: 'revise_one' + template_name: 'revise_two' engine_config: - engine_name: 'finetuned_nl2sql' + engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' temperature: 0.0 - parser_name: 'revise_new' + parser_name: 'revise' diff --git a/CHESS/src/llm/engine_configs.py b/CHESS/src/llm/engine_configs.py index 78e03b9..68accd9 100644 --- a/CHESS/src/llm/engine_configs.py +++ b/CHESS/src/llm/engine_configs.py @@ -103,7 +103,7 @@ ENGINE_CONFIGS: Dict[str, Dict[str, Any]] = { "params": { "model": "AI4DS/NL2SQL_DeepSeek_33B", "openai_api_key": "EMPTY", - "openai_api_base": "/v1", + "openai_api_base": "http://localhost:8000/v1", "max_tokens": 400, "temperature": 0, "stop": ["```\n", ";"] @@ -147,28 +147,33 @@ ENGINE_CONFIGS: Dict[str, Dict[str, Any]] = { # "stop": [";"] # } # }, + # "meta-llama/Meta-Llama-3-70B-Instruct": { + # "constructor": ChatOpenAI, + # "params": { + # "model": "meta-llama/Meta-Llama-3-70B-Instruct", + # "openai_api_key": "EMPTY", + # "openai_api_base": "/v1", + # "max_tokens": 600, + # "temperature": 0, + # "model_kwargs": { + # "stop": [""] + # } + # } + # }, "meta-llama/Meta-Llama-3-70B-Instruct": { - "constructor": ChatOpenAI, - "params": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "openai_api_key": "EMPTY", - "openai_api_base": "/v1", - "max_tokens": 600, - "temperature": 0, - "model_kwargs": { - "stop": [""] - } - } - }, - "meta-llama/llama3-2": { "constructor": ChatOllama, + #"constructor": ChatOpenAI, "params": { - "model": "llama3.2", + "model": "llama3:70b", + #"openai_api_key": "EMPTY", + #"openai_api_base": "http://localhost:11434/v1", + "max_tokens": 600, + #"num_predict": 128, # Should be equiv to max_tokens, Ollama is 128 "temperature": 0, "model_kwargs": { "stop": ["<|eot_id|>"], }, - "num_ctx": 32768 + "num_ctx": 8192 } } # "meta-llama/llama3-2": { diff --git a/scripts/copyrepo.sh b/scripts/copyrepo.sh index ce44237..cd423b3 100644 --- a/scripts/copyrepo.sh +++ b/scripts/copyrepo.sh @@ -7,11 +7,14 @@ #SBATCH --mem=128G # reserved RAM #SBATCH --cpus-per-task=4 # number of CPU cores per task #SBATCH --gres=gpu:2 # number of GPUs -#SBATCH --time=01:00:00 # Time limit for the entire job +#SBATCH --time=2-02:00:00 # Time limit for the entire job (increased to 2d 2h) +#SBATCH --nodelist=workg02 # The 80 GB A100 GPUs are on node workg02 ## NOTE: --cpus-per-task is logical cores rather than physical (Multithreading)! -## Add --cpus-per-task=<even-no> or ${SLURM_} to each parallel task (preferred) +## Add --cpus-per-task=<even-no> or ${SLURM_CPUS_PER_TASK} to each parallel task (preferred) ## Or disable multithreading: ##SBATCH --hint=nomultithread +## Switched to separate scripts and bash jobs for concurrent jobs +## This version is not to be run in parallel to another slurm batch job with the same script! (copyrepo.sh) KUERZEL=$1 REPOPATH=$2 @@ -30,12 +33,16 @@ fi if [ ! -f $2 ]; then echo "File $2 does not exist or is not a file." echo "Please specify a user shorthand and the re-CHESS repo zip file path:" - printf '\tbatch.sh <user-shorthand> <repo-path>\n' + printf '\tcopyrepo.sh <user-shorthand> <repo-path>\n' exit 1 fi +# Useful debug Information +srun echo "Tested: <running CHESS on SDS-BIRD>" # place a little reminder on what was tested here. +srun hostname +srun nvidia-smi + # Create user directory on /zpool1/slurm_data/ if it doesn't exist yet -#srun $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then echo 'not doing rm -r /zpool1/slurm_data/${KUERZEL}/*'; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) srun --ntasks=1 $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then rm -r /zpool1/slurm_data/${KUERZEL}/*; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) # Create directory for the job @@ -43,7 +50,7 @@ srun --ntasks=1 mkdir /zpool1/slurm_data/${KUERZEL}/re-chess RECHESSROOT=/zpool1/slurm_data/${KUERZEL}/re-chess -# Copy re-CHESS repo to the jobs folder and unpack +# Copy re-CHESS repo to the directory in /zpool1/slurm_data/ and unpack sbcast $2 /zpool1/slurm_data/${KUERZEL}/re-chess.zip srun --ntasks=1 unzip /zpool1/slurm_data/${KUERZEL}/re-chess.zip -d /zpool1/slurm_data/${KUERZEL} @@ -59,8 +66,8 @@ OLPLOGFILE=/dev/null # log file for ollama pull (all occurences) export OLLAMA_MODELS=${OLPATH}/models export OLLAMA_KEEP_ALIVE=-1 -# Pull/copy llama3:70b -srun --ntasks=1 --time='00:06:00' ollamapull.sh ${OLPATH} +# Pull/copy llama3:70b, typically takes 4.5 to 5 mins +srun --ntasks=1 --time='00:07:00' ollamapull.sh ${OLPATH} #srun --exclusive --time='00:06:00' --ntasks=1 --nodes=${SLURM_NNODES} --cpus-per-task=${SLURM_CPUS_PER_TASK} ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & #TASKPID=$! # sleep 10 @@ -120,7 +127,7 @@ source "${VENVPATH}/bin/activate" ; srun --ntasks=1 pip3.11 install -r "${RECHESSROOT}/CHESS/requirements.txt" # Load nl2sql model via vllm -srun --time='00:05:00' --ntasks=1 ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --dtype float32 --download-dir "${RECHESSROOT}/vllm" +srun --time='00:05:00' --ntasks=1 ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --load-format safetensors --dtype bfloat16 --max-model-len 16384 --download-dir "${RECHESSROOT}/vllm" srun --ntasks=1 ls -la "${VENVPATH}/lib/${PYTHONVER}/site-packages" deactivate # srun which pip3.11 ; # for debugging purposes @@ -134,12 +141,12 @@ srun --ntasks=1 cp ${RECHESSROOT}/CHESS/.env-rechess-bird-sds ${RECHESSROOT}/CHE # run preprocessing script # srun cd ${RECHESSROOT}; source ${VENVPATH}/bin/activate; ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & sleep 10; /run/./run-preprocess.sh ; kill %1; deactivate; cd $OLDPWD -srun --ntasks=1 --time='00:15:00' runchess.sh ${RECHESSROOT} ${PYTHONVER} +srun --ntasks=1 --time='2-00:00:00' runchess.sh ${RECHESSROOT} ${PYTHONVER} #source ${VENVPATH}/bin/activate #cd ${RECHESSROOT}/CHESS -#srun --time='00:15:00' --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & -#srun --time='00:15:00' --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "meta-llama/Meta-Llama-3-70B-Instruct" --dtype float32 --download-dir "${RECHESSROOT}/vllm" & +#srun --time='10:00:00' --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} ${OLPATH}/./bin/ollama serve &> ${OLSLOGFILE} & +#srun --time='10:00:00' --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "meta-llama/Meta-Llama-3-70B-Instruct" --dtype float32 --download-dir "${RECHESSROOT}/vllm" & #srun --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} sleep 10 & #wait %3 #srun --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} run/./run_preprocess.sh & @@ -151,7 +158,10 @@ srun --ntasks=1 --time='00:15:00' runchess.sh ${RECHESSROOT} ${PYTHONVER} #cd $OLDERPWD # gather results, potentially breaks because of filenames starting with '-' ? -srun --ntasks=1 zip -r ${PWD}/results.zip ${RECHESSROOT}/CHESS/results/* +OLDERPWD=${PWD} +cd ${RECHESSROOT}/CHESS +srun --ntasks=1 zip -r ${OLDERPWD}/results-${SLURM_JOB_ID}.zip results/* +cd ${OLDERPWD} ##srun /zpool1/ ##srun ls -la /lib diff --git a/scripts/runchess.sh b/scripts/runchess.sh index 69c13c1..e62855f 100755 --- a/scripts/runchess.sh +++ b/scripts/runchess.sh @@ -5,6 +5,7 @@ PYTHONVER=$2 # OLLAMA_MODELS=${RECHESSROOT}/ollama/models OLDERPWD=${PWD} OLLOGSERVE="/dev/null" +VLLMLOGDUMP="/dev/null" source ${RECHESSROOT}/CHESS/venv/bin/activate @@ -23,7 +24,8 @@ date; sleep 30; date; CUDA_VISIBLE_DEVICES=$gpuzero -${PYTHONVER} -m vllm.entrypoints.openai.api_server --gpu-memory-utilization 0.8 --model "AI4DS/NL2SQL_DeepSeek_33B" --dtype float32 --download-dir "${RECHESSROOT}/vllm" & +#${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --load-format safetensors --dtype bfloat16 --max-model-len 8192 --download-dir "${RECHESSROOT}/vllm" & +${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --load-format safetensors --dtype bfloat16 --max-model-len 8192 --download-dir "${RECHESSROOT}/vllm" 1> ${VLLMLOGDUMP} & date; sleep 30; -- GitLab From d21b36cc56f4425726683572f62b00687e880898 Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Mon, 3 Mar 2025 12:44:59 +0100 Subject: [PATCH 06/12] Added sds in data subdir --- CHESS/data/dev/sub_sampled_bird_dev_set.json | 1178 ++++++++++++++++++ 1 file changed, 1178 insertions(+) create mode 100644 CHESS/data/dev/sub_sampled_bird_dev_set.json diff --git a/CHESS/data/dev/sub_sampled_bird_dev_set.json b/CHESS/data/dev/sub_sampled_bird_dev_set.json new file mode 100644 index 0000000..4b3fa3a --- /dev/null +++ b/CHESS/data/dev/sub_sampled_bird_dev_set.json @@ -0,0 +1,1178 @@ +[ + { + "question_id": 9, + "db_id": "california_schools", + "question": "Among the schools with the average score in Math over 560 in the SAT test, how many schools are directly charter-funded?", + "evidence": "", + "SQL": "SELECT COUNT(T2.`School Code`) FROM satscores AS T1 INNER JOIN frpm AS T2 ON T1.cds = T2.CDSCode WHERE T1.AvgScrMath > 560 AND T2.`Charter Funding Type` = 'Directly funded'", + "difficulty": "simple" + }, + { + "question_id": 21, + "db_id": "california_schools", + "question": "In Los Angeles how many schools have more than 500 free meals but less than 700 free or reduced price meals for K-12?", + "evidence": "", + "SQL": "SELECT COUNT(CDSCode) FROM frpm WHERE `County Name` = 'Los Angeles' AND `Free Meal Count (K-12)` > 500 AND `Free Meal Count (K-12)` < 700", + "difficulty": "simple" + }, + { + "question_id": 25, + "db_id": "california_schools", + "question": "Name schools in Riverside which the average of average math score for SAT is grater than 400, what is the funding type of these schools?", + "evidence": "Average of average math = sum(average math scores) / count(schools).", + "SQL": "SELECT T1.sname, T2.`Charter Funding Type` FROM satscores AS T1 INNER JOIN frpm AS T2 ON T1.cds = T2.CDSCode WHERE T2.`District Name` LIKE 'Riverside%' GROUP BY T1.sname, T2.`Charter Funding Type` HAVING CAST(SUM(T1.AvgScrMath) AS REAL) / COUNT(T1.cds) > 400", + "difficulty": "moderate" + }, + { + "question_id": 35, + "db_id": "california_schools", + "question": "What is the administrator's email address of the chartered school with the fewest students enrolled in grades 1 through 12?", + "evidence": "Charted school means `Charter School (Y/N)` = 1 in the table frpm; Students enrolled in grades 1 through 12 refers to `Enrollment (K-12)`", + "SQL": "SELECT T2.AdmEmail1 FROM frpm AS T1 INNER JOIN schools AS T2 ON T1.CDSCode = T2.CDSCode WHERE T1.`Charter School (Y/N)` = 1 ORDER BY T1.`Enrollment (K-12)` ASC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 40, + "db_id": "california_schools", + "question": "What is the telephone number for the school with the lowest average score in reading in Fresno Unified?", + "evidence": "Fresno Unified is a name of district;", + "SQL": "SELECT T2.Phone FROM satscores AS T1 INNER JOIN schools AS T2 ON T1.cds = T2.CDSCode WHERE T2.District = 'Fresno Unified' AND T1.AvgScrRead IS NOT NULL ORDER BY T1.AvgScrRead ASC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 43, + "db_id": "california_schools", + "question": "What is the average math score of the school with the lowest average score for all subjects, and in which county is it located?", + "evidence": "Average score for all subjects can be computed by AvgScrMath + AvgScrRead + AvgScrWrite", + "SQL": "SELECT T1.AvgScrMath, T2.County FROM satscores AS T1 INNER JOIN schools AS T2 ON T1.cds = T2.CDSCode WHERE T1.AvgScrMath IS NOT NULL ORDER BY T1.AvgScrMath + T1.AvgScrRead + T1.AvgScrWrite ASC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 76, + "db_id": "california_schools", + "question": "What is the city location of the high school level school with Lunch Provision 2 whose lowest grade is 9 and the highest grade is 12 in the county of Merced?", + "evidence": "High school can be represented as EILCode = 'HS'", + "SQL": "SELECT T2.City FROM frpm AS T1 INNER JOIN schools AS T2 ON T1.CDSCode = T2.CDSCode WHERE T1.`NSLP Provision Status` = 'Lunch Provision 2' AND T2.County = 'Merced' AND T1.`Low Grade` = 9 AND T1.`High Grade` = 12 AND T2.EILCode = 'HS'", + "difficulty": "moderate" + }, + { + "question_id": 81, + "db_id": "california_schools", + "question": "In which city can you find the school in the state of California with the lowest latitude coordinates and what is its lowest grade? Indicate the school name.", + "evidence": "State of California refers to state = 'CA'", + "SQL": "SELECT T2.City, T1.`Low Grade`, T1.`School Name` FROM frpm AS T1 INNER JOIN schools AS T2 ON T1.CDSCode = T2.CDSCode WHERE T2.State = 'CA' ORDER BY T2.Latitude ASC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 93, + "db_id": "financial", + "question": "How many male customers who are living in North Bohemia have average salary greater than 8000?", + "evidence": "Male means that gender = 'M'; A3 refers to region; A11 pertains to average salary.", + "SQL": "SELECT COUNT(T1.client_id) FROM client AS T1 INNER JOIN district AS T2 ON T1.district_id = T2.district_id WHERE T1.gender = 'M' AND T2.A3 = 'North Bohemia' AND T2.A11 > 8000", + "difficulty": "moderate" + }, + { + "question_id": 97, + "db_id": "financial", + "question": "List out the clients who choose statement of issuance after transaction are Disponent?", + "evidence": "'POPLATEK PO OBRATU' stands for issuance after transaction", + "SQL": "SELECT T2.client_id FROM account AS T1 INNER JOIN disp AS T2 ON T1.account_id = T2.account_id WHERE T1.frequency = 'POPLATEK PO OBRATU' AND T2.type = 'DISPONENT'", + "difficulty": "simple" + }, + { + "question_id": 98, + "db_id": "financial", + "question": "Among the accounts who have approved loan date in 1997, list out the accounts that have the lowest approved amount and choose weekly issuance statement.", + "evidence": "'POPLATEK TYDNE' stands for weekly issuance", + "SQL": "SELECT T2.account_id FROM loan AS T1 INNER JOIN account AS T2 ON T1.account_id = T2.account_id WHERE STRFTIME('%Y', T1.date) = '1997' AND T2.frequency = 'POPLATEK TYDNE' ORDER BY T1.amount LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 111, + "db_id": "financial", + "question": "How many accounts were opened in Litomerice in 1996?", + "evidence": "A2 refers to district name; Litomerice is one of district names.", + "SQL": "SELECT COUNT(T2.account_id) FROM district AS T1 INNER JOIN account AS T2 ON T1.district_id = T2.district_id WHERE STRFTIME('%Y', T2.date) = '1996' AND T1.A2 = 'Litomerice'", + "difficulty": "simple" + }, + { + "question_id": 130, + "db_id": "financial", + "question": "How many of the account holders in South Bohemia still do not own credit cards?", + "evidence": "A3 contains the region names; South Bohemia is one of region names.", + "SQL": "SELECT COUNT(T3.account_id) FROM district AS T1 INNER JOIN client AS T2 ON T1.district_id = T2.district_id INNER JOIN disp AS T3 ON T2.client_id = T3.client_id WHERE T1.A3 = 'south Bohemia' AND T3.type != 'OWNER'", + "difficulty": "moderate" + }, + { + "question_id": 139, + "db_id": "financial", + "question": "How many high-level credit cards have \"disponent\" type of disposition?", + "evidence": "High-level credit cards refers to the cards with the gold type.", + "SQL": "SELECT COUNT(T1.card_id) FROM card AS T1 INNER JOIN disp AS T2 ON T1.disp_id = T2.disp_id WHERE T1.type = 'gold' AND T2.type = 'DISPONENT'", + "difficulty": "simple" + }, + { + "question_id": 158, + "db_id": "financial", + "question": "What is the district Id of the account that placed the order with the id 33333?", + "evidence": "", + "SQL": "SELECT T3.district_id FROM `order` AS T1 INNER JOIN account AS T2 ON T1.account_id = T2.account_id INNER JOIN district AS T3 ON T2.district_id = T3.district_id WHERE T1.order_id = 33333", + "difficulty": "simple" + }, + { + "question_id": 164, + "db_id": "financial", + "question": "Who placed the order with the id 32423?", + "evidence": "", + "SQL": "SELECT T3.client_id FROM `order` AS T1 INNER JOIN account AS T2 ON T1.account_id = T2.account_id INNER JOIN client AS T3 ON T2.district_id = T3.district_id WHERE T1.order_id = 32423", + "difficulty": "simple" + }, + { + "question_id": 176, + "db_id": "financial", + "question": "What is the amount of debt that client number 992 has, and how is this client doing with payments?", + "evidence": "", + "SQL": "SELECT T3.amount, T3.status FROM client AS T1 INNER JOIN account AS T2 ON T1.district_id = T2.district_id INNER JOIN loan AS T3 ON T2.account_id = T3.account_id WHERE T1.client_id = 992", + "difficulty": "simple" + }, + { + "question_id": 189, + "db_id": "financial", + "question": "Name the account numbers of female clients who are oldest and have lowest average salary?", + "evidence": "Female refers to 'F' in the gender; A11 contains information about average salary", + "SQL": "SELECT T3.account_id FROM client AS T1 INNER JOIN district AS T2 ON T1.district_id = T2.district_id INNER JOIN account AS T3 ON T2.district_id = T3.district_id WHERE T1.gender = 'F' ORDER BY T1.birth_date ASC, T2.A11 ASC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 231, + "db_id": "toxicology", + "question": "Which bond type accounted for the majority of the bonds found in molecule TR018 and state whether or not this molecule is carcinogenic?", + "evidence": "TR018 is the molecule id; majority of the bond found refers to MAX(COUNT(bond_type)); label = '+' mean molecules are carcinogenic; label = '-' means molecules are non-carcinogenic", + "SQL": "SELECT T.bond_type FROM ( SELECT T1.bond_type, COUNT(T1.molecule_id) FROM bond AS T1 WHERE T1.molecule_id = 'TR018' GROUP BY T1.bond_type ORDER BY COUNT(T1.molecule_id) DESC LIMIT 1 ) AS T", + "difficulty": "challenging" + }, + { + "question_id": 240, + "db_id": "toxicology", + "question": "List all the elements of the toxicology of the molecule \"TR004\".", + "evidence": "TR004 is the molecule id; element = 'cl' means Chlorine; element = 'c' means Carbon; element = 'h' means Hydrogen; element = 'o' means Oxygen, element = 's' means Sulfur; element = 'n' means Nitrogen, element = 'p' means Phosphorus, element = 'na' means Sodium, element = 'br' means Bromine, element = 'f' means Fluorine; element = 'i' means Iodine; element = 'sn' means Tin; element = 'pb' means Lead; element = 'te' means Tellurium; element = 'ca' means Calcium", + "SQL": "SELECT DISTINCT T.element FROM atom AS T WHERE T.molecule_id = 'TR004'", + "difficulty": "challenging" + }, + { + "question_id": 257, + "db_id": "toxicology", + "question": "List down atom id2 for atoms with element sulfur.", + "evidence": "element sulfur refers to element = 's'", + "SQL": "SELECT DISTINCT T2.atom_id2 FROM atom AS T1 INNER JOIN connected AS T2 ON T1.atom_id = T2.atom_id WHERE T1.element = 's'", + "difficulty": "simple" + }, + { + "question_id": 267, + "db_id": "toxicology", + "question": "List down the bond type for molecules from molecule id TR000 to TR050.", + "evidence": "double bond refers to bond_type = ' = '; single bond refers to bond_type = '-'; triple bond refers to bond_type = '#';", + "SQL": "SELECT T2.molecule_id, T2.bond_type FROM molecule AS T1 INNER JOIN bond AS T2 ON T1.molecule_id = T2.molecule_id WHERE T1.molecule_id BETWEEN 'TR000' AND 'TR050'", + "difficulty": "moderate" + }, + { + "question_id": 286, + "db_id": "toxicology", + "question": "Among all chemical compounds identified in the database, what percent of compounds form a triple-bond.", + "evidence": "triple bond refers to bond_type = '#';", + "SQL": "SELECT CAST(COUNT(CASE WHEN T.bond_type = '#' THEN T.bond_id ELSE NULL END) AS REAL) * 100 / COUNT(T.bond_id) FROM bond AS T", + "difficulty": "simple" + }, + { + "question_id": 287, + "db_id": "toxicology", + "question": "Among all chemical compounds that contain molecule TR047, identify the percent that form a double-bond.", + "evidence": "TR047 is the molecule id; double bond refers to bond_type = ' = '; percentage = DIVIDE(SUM(bond_type = ' = '), COUNT(all bond_id)) as percent where molecule_id = 'TR047'", + "SQL": "SELECT CAST(COUNT(CASE WHEN T.bond_type = '=' THEN T.bond_id ELSE NULL END) AS REAL) * 100 / COUNT(T.bond_id) FROM bond AS T WHERE T.molecule_id = 'TR047'", + "difficulty": "moderate" + }, + { + "question_id": 288, + "db_id": "toxicology", + "question": "Identify whether the molecule that contains atom TR001_1 is carcinogenic.", + "evidence": "label = '+' mean molecules are carcinogenic;", + "SQL": "SELECT T2.label AS flag_carcinogenic FROM atom AS T1 INNER JOIN molecule AS T2 ON T1.molecule_id = T2.molecule_id WHERE T1.atom_id = 'TR001_1'", + "difficulty": "simple" + }, + { + "question_id": 291, + "db_id": "toxicology", + "question": "How many chemical compounds in the database are identified as carcinogenic.", + "evidence": "label = '+' mean molecules are carcinogenic;", + "SQL": "SELECT COUNT(T.molecule_id) FROM molecule AS T WHERE T.label = '+'", + "difficulty": "simple" + }, + { + "question_id": 303, + "db_id": "toxicology", + "question": "How many double bonds does TR006 have and is it carcinogenic?", + "evidence": "label = '+' mean molecules are carcinogenic; label = '-' means molecules are non-carcinogenic; double bond refers to bond_type = ' = ';", + "SQL": "SELECT COUNT(T1.bond_id), T2.label FROM bond AS T1 INNER JOIN molecule AS T2 ON T1.molecule_id = T2.molecule_id WHERE T1.bond_type = '=' AND T2.molecule_id = 'TR006' GROUP BY T2.label", + "difficulty": "moderate" + }, + { + "question_id": 307, + "db_id": "toxicology", + "question": "Name the atoms' elements that form bond TR000_2_3.", + "evidence": "element = 'cl' means Chlorine; element = 'c' means Carbon; element = 'h' means Hydrogen; element = 'o' means Oxygen, element = 's' means Sulfur; element = 'n' means Nitrogen, element = 'p' means Phosphorus, element = 'na' means Sodium, element = 'br' means Bromine, element = 'f' means Fluorine; element = 'i' means Iodine; element = 'sn' means Tin; element = 'pb' means Lead; element = 'te' means Tellurium; element = 'ca' means Calcium", + "SQL": "SELECT T2.element FROM connected AS T1 INNER JOIN atom AS T2 ON T1.atom_id = T2.atom_id WHERE T1.bond_id = 'TR000_2_3'", + "difficulty": "challenging" + }, + { + "question_id": 309, + "db_id": "toxicology", + "question": "List out the atom id that belongs to the TR346 molecule and how many bond type can be created by this molecule?", + "evidence": "", + "SQL": "SELECT T1.atom_id, COUNT(DISTINCT T2.bond_type) FROM atom AS T1 INNER JOIN bond AS T2 ON T1.molecule_id = T2.molecule_id WHERE T1.molecule_id = 'TR346' GROUP BY T1.atom_id, T2.bond_type", + "difficulty": "simple" + }, + { + "question_id": 326, + "db_id": "toxicology", + "question": "Which molecule consisted of Sulphur atom with double bond?", + "evidence": "sulphur refers to element - 's'; double bond refers to bond_type = ' = ';", + "SQL": "SELECT DISTINCT T1.molecule_id FROM atom AS T1 INNER JOIN bond AS T2 ON T1.molecule_id = T2.molecule_id WHERE T1.element = 's' AND T2.bond_type = '='", + "difficulty": "simple" + }, + { + "question_id": 337, + "db_id": "toxicology", + "question": "List the element and bond type included in the molecule with molecule ID of TR016.", + "evidence": "element = 'cl' means Chlorine; element = 'c' means Carbon; element = 'h' means Hydrogen; element = 'o' means Oxygen, element = 's' means Sulfur; element = 'n' means Nitrogen, element = 'p' means Phosphorus, element = 'na' means Sodium, element = 'br' means Bromine, element = 'f' means Fluorine; element = 'i' means Iodine; element = 'sn' means Tin; element = 'pb' means Lead; element = 'te' means Tellurium; element = 'ca' means Calcium; double bond refers to bond_type = ' = '; single bond refers to bond_type = '-'; triple bond refers to bond_type = '#';", + "SQL": "SELECT DISTINCT T1.element, T2.bond_type FROM atom AS T1 INNER JOIN bond AS T2 ON T1.molecule_id = T2.molecule_id WHERE T1.molecule_id = 'TR016'", + "difficulty": "challenging" + }, + { + "question_id": 338, + "db_id": "toxicology", + "question": "What is the atom ID of double bonded carbon in TR012 molecule?", + "evidence": "carbon refers to element = 'c'; double bond refers to bond_type = ' = ';", + "SQL": "SELECT T1.atom_id FROM atom AS T1 INNER JOIN molecule AS T2 ON T1.molecule_id = T2.molecule_id INNER JOIN bond AS T3 ON T2.molecule_id = T3.molecule_id WHERE T2.molecule_id = 'TR012' AND T3.bond_type = '=' AND T1.element = 'c'", + "difficulty": "moderate" + }, + { + "question_id": 347, + "db_id": "card_games", + "question": "Find all cards illustrated by Stephen Daniel and describe the text of the ruling of these cards. State if these cards have missing or degraded properties and values.", + "evidence": "cards have missing or degraded properties and value refers to hasContentWarning = 1; 'Stephen Daniele' is artist;", + "SQL": "SELECT T1.id, T2.text, T1.hasContentWarning FROM cards AS T1 INNER JOIN rulings AS T2 ON T1.uuid = T2.uuid WHERE T1.artist = 'Stephen Daniele'", + "difficulty": "moderate" + }, + { + "question_id": 350, + "db_id": "card_games", + "question": "State the alternative languages available for card named Annul numbered 29.", + "evidence": "annul refers to name = 'annul'; numbered 29 refers to number = '29';", + "SQL": "SELECT T2.language FROM cards AS T1 INNER JOIN foreign_data AS T2 ON T1.uuid = T2.uuid WHERE T1.name = 'Annul' AND T1.number = 29", + "difficulty": "simple" + }, + { + "question_id": 353, + "db_id": "card_games", + "question": "List all the sets available in Italian translation. State the total number of cards per set.", + "evidence": "Italian translation refers to language = 'Italian'; total number of card per set refers to totalSetSize;", + "SQL": "SELECT T1.name, T1.totalSetSize FROM sets AS T1 INNER JOIN set_translations AS T2 ON T1.code = T2.setCode WHERE T2.language = 'Italian'", + "difficulty": "simple" + }, + { + "question_id": 362, + "db_id": "card_games", + "question": "What is the description about the ruling of card \"Condemn\"?", + "evidence": "Ancestor's Chosen' is the name of card; description about the ruling refers to text;", + "SQL": "SELECT T2.text FROM cards AS T1 INNER JOIN rulings AS T2 ON T1.uuid = T2.uuid WHERE T1.name = 'Condemn'", + "difficulty": "simple" + }, + { + "question_id": 387, + "db_id": "card_games", + "question": "What are the cards for set OGW? State the colour for these cards.", + "evidence": "set OGW refers to setCode = 'OGW';", + "SQL": "SELECT id, colors FROM cards WHERE id IN ( SELECT id FROM set_translations WHERE setCode = 'OGW' )", + "difficulty": "simple" + }, + { + "question_id": 402, + "db_id": "card_games", + "question": "What is the percentage of Story Spotlight cards that also have a text box? List them by their ID.", + "evidence": "Story Spotlight cards that do not have a text box refers to isStorylight = 1 and isTextless = 1; Percentage refer to DIVIDE(SUM(count(id) where isStorylight = 1), SUM(count(id))) * 100\n\n", + "SQL": "SELECT CAST(SUM(CASE WHEN isTextless = 0 THEN 1 ELSE 0 END) AS REAL) * 100 / COUNT(id) FROM cards WHERE isStorySpotlight = 1", + "difficulty": "moderate" + }, + { + "question_id": 419, + "db_id": "card_games", + "question": "How many color cards with no borders have been ranked higher than 12000 on EDHRec?", + "evidence": "color cards with no borders refers to borderColor = 'borderless'; ranked higher than 12000 on EDHRec refers to edhrecRank > 12000", + "SQL": "SELECT COUNT(id) FROM cards WHERE edhrecRank > 12000 AND borderColor = 'borderless'", + "difficulty": "simple" + }, + { + "question_id": 423, + "db_id": "card_games", + "question": "Please provide the ids of top three powerful pairs of Kingdom Foil and Kingdom Cards sorted by Kingdom Foil id in alphabetical order.", + "evidence": "poweful refers to cardKingdomFoilId is not null AND cardKingdomId is not null", + "SQL": "SELECT cardKingdomFoilId, cardKingdomId FROM cards WHERE cardKingdomFoilId IS NOT NULL AND cardKingdomId IS NOT NULL ORDER BY cardKingdomFoilId LIMIT 3", + "difficulty": "simple" + }, + { + "question_id": 436, + "db_id": "card_games", + "question": "How many cards have frame effect as extendedart? List out the id of those cards.", + "evidence": "\nframe effect as extendedart refers to frameEffects = 'extendedart'\n", + "SQL": "SELECT id FROM cards WHERE frameEffects = 'extendedart' GROUP BY id", + "difficulty": "simple" + }, + { + "question_id": 448, + "db_id": "card_games", + "question": "Name the foreign name of the card that has abzan watermark? List out the type of this card.", + "evidence": "", + "SQL": "SELECT DISTINCT T1.name, T1.type FROM cards AS T1 INNER JOIN foreign_data AS T2 ON T2.uuid = T1.uuid WHERE T1.watermark = 'abzan'", + "difficulty": "simple" + }, + { + "question_id": 470, + "db_id": "card_games", + "question": "When was the set of cards with \"Ancestor's Chosen\" released?", + "evidence": "card set \"Ancestor's Chosen\" refers to name = 'Ancestor''s Chosen'; when released refers to releaseDate", + "SQL": "SELECT DISTINCT T2.releaseDate FROM cards AS T1 INNER JOIN sets AS T2 ON T2.code = T1.setCode WHERE T1.name = 'Ancestor''s Chosen'", + "difficulty": "simple" + }, + { + "question_id": 479, + "db_id": "card_games", + "question": "Among the cards with converted mana cost higher than 5 in the set Coldsnap, how many of them have unknown power?", + "evidence": "card set Coldsnap refers to name = 'Coldsnap'; converted mana cost higher than 5 refers to convertedManaCost > 5; unknown power refers to power = '*' or T1.power is null", + "SQL": "SELECT SUM(CASE WHEN T1.power LIKE '%*%' OR T1.power IS NULL THEN 1 ELSE 0 END) FROM cards AS T1 INNER JOIN sets AS T2 ON T2.code = T1.setCode WHERE T2.name = 'Coldsnap' AND T1.convertedManaCost > 5", + "difficulty": "moderate" + }, + { + "question_id": 483, + "db_id": "card_games", + "question": "Please list the Italian text ruling of all the cards in the set Coldsnap.", + "evidence": "card set Coldsnap refers to name = 'Coldsnap'; Italian refers to language = 'Italian'", + "SQL": "SELECT DISTINCT T1.text FROM foreign_data AS T1 INNER JOIN cards AS T2 ON T2.uuid = T1.uuid INNER JOIN sets AS T3 ON T3.code = T2.setCode WHERE T3.name = 'Coldsnap' AND T1.language = 'Italian'", + "difficulty": "moderate" + }, + { + "question_id": 484, + "db_id": "card_games", + "question": "Please list the Italian names of the cards in the set Coldsnap with the highest converted mana cost.", + "evidence": "card set Coldsnap refers to name = 'Coldsnap'; Italian refers to language = 'Italian'", + "SQL": "SELECT T2.name FROM foreign_data AS T1 INNER JOIN cards AS T2 ON T2.uuid = T1.uuid INNER JOIN sets AS T3 ON T3.code = T2.setCode WHERE T3.name = 'Coldsnap' AND T1.language = 'Italian' ORDER BY T2.convertedManaCost DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 489, + "db_id": "card_games", + "question": "List the keyrune code for the set whose code is 'PKHC'.", + "evidence": "keyrune code refers to keyruneCode", + "SQL": "SELECT keyruneCode FROM sets WHERE code = 'PKHC'", + "difficulty": "simple" + }, + { + "question_id": 510, + "db_id": "card_games", + "question": "Among the cards that doesn't have multiple faces on the same card, who is the illustrator of the card art that has the highest cost of converted mana?", + "evidence": "doesn't have multiple faces refers to side IS NULL; illustrator refers to artist", + "SQL": "SELECT artist FROM cards WHERE side IS NULL ORDER BY convertedManaCost DESC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 515, + "db_id": "card_games", + "question": "When was the oldest mythic card released and what are its legal play formats?", + "evidence": "the oldest card refers to MIN(originalReleaseDate); mythic card refers to rarity = 'mythic'; legal play refers to status = 'legal'; play format refers to format", + "SQL": "SELECT T1.originalReleaseDate, T2.format FROM cards AS T1 INNER JOIN legalities AS T2 ON T1.uuid = T2.uuid WHERE T1.rarity = 'mythic' AND T1.originalReleaseDate IS NOT NULL AND T2.status = 'Legal' ORDER BY T1.originalReleaseDate LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 520, + "db_id": "card_games", + "question": "Who is the illustrator that illustrated the least amount of cards? List the format of play of the cards that he/she illustrated.", + "evidence": "format of the cards refers to format; illustrator refers to artist; the least amount of cards refers to MIN(artist)", + "SQL": "SELECT T1.artist, T2.format FROM cards AS T1 INNER JOIN legalities AS T2 ON T2.uuid = T1.uuid GROUP BY T1.artist ORDER BY COUNT(T1.id) ASC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 529, + "db_id": "card_games", + "question": "Find and list the names of sets which doesn't have Japanese translation but have Korean translation.", + "evidence": "names of sets refers to name; doesn't have Japanese translation refers to language not like '%Japanese%'; have Korean translation refers to language = 'Korean'", + "SQL": "SELECT name FROM sets WHERE code IN ( SELECT setCode FROM set_translations WHERE language = 'Korean' AND language NOT LIKE '%Japanese%' )", + "difficulty": "moderate" + }, + { + "question_id": 538, + "db_id": "codebase_community", + "question": "Please list the titles of the posts owned by the user csgillespie?", + "evidence": "\"csgillespie\" is the DisplayName of user", + "SQL": "SELECT T1.Title FROM posts AS T1 INNER JOIN users AS T2 ON T1.OwnerUserId = T2.Id WHERE T2.DisplayName = 'csgillespie'", + "difficulty": "simple" + }, + { + "question_id": 541, + "db_id": "codebase_community", + "question": "What is the display name of the user who is the owner of the most valuable post?", + "evidence": "most valuable post refers to Max(FavoriteCount)", + "SQL": "SELECT T2.DisplayName FROM posts AS T1 INNER JOIN users AS T2 ON T1.OwnerUserId = T2.Id ORDER BY T1.FavoriteCount DESC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 549, + "db_id": "codebase_community", + "question": "From which post is the tag \"bayesian\" excerpted from? Please give the body of the post.", + "evidence": "\"bayesian\" is the TagName; excerpt from refers to ExcerptPostId", + "SQL": "SELECT T2.Body FROM tags AS T1 INNER JOIN posts AS T2 ON T2.Id = T1.ExcerptPostId WHERE T1.TagName = 'bayesian'", + "difficulty": "simple" + }, + { + "question_id": 575, + "db_id": "codebase_community", + "question": "What is the badge name that user 'SilentGhost' obtained?", + "evidence": "\"SilentGhost\" is the DisplayName of user;", + "SQL": "SELECT T2.Name FROM users AS T1 INNER JOIN badges AS T2 ON T1.Id = T2.UserId WHERE T1.DisplayName = 'SilentGhost'", + "difficulty": "simple" + }, + { + "question_id": 586, + "db_id": "codebase_community", + "question": "Which user added a bounty amount of 50 to the post title mentioning variance?", + "evidence": "\"bounty amount of 50 refers to BountyAmount = 50; user refers to DisplayName", + "SQL": "SELECT T3.DisplayName, T1.Title FROM posts AS T1 INNER JOIN votes AS T2 ON T1.Id = T2.PostId INNER JOIN users AS T3 ON T3.Id = T2.UserId WHERE T2.BountyAmount = 50 AND T1.Title LIKE '%variance%'", + "difficulty": "challenging" + }, + { + "question_id": 591, + "db_id": "codebase_community", + "question": "How many users are awarded with supporter badge during year 2011?", + "evidence": "\"Supporter\" is the Name of badge; in year 2011 refers to year(Date) = 2011", + "SQL": "SELECT COUNT(Id) FROM badges WHERE STRFTIME('%Y', Date) = '2011' AND Name = 'Supporter'", + "difficulty": "simple" + }, + { + "question_id": 593, + "db_id": "codebase_community", + "question": "How many users from New York have a teacher and supporter badge?", + "evidence": "\"Supporter\" and \"Teachers\" are both Name of badge; 'New York' is the Location; user refers to UserId", + "SQL": "SELECT COUNT(DISTINCT T1.Id) FROM badges AS T1 INNER JOIN users AS T2 ON T1.UserId = T2.Id WHERE T1.Name IN ('Supporter', 'Teacher') AND T2.Location = 'New York'", + "difficulty": "simple" + }, + { + "question_id": 600, + "db_id": "codebase_community", + "question": "List out all post that are related to post ID 61217 and what is the popularity of this post?", + "evidence": "post related refers to RelatedPostId; popularity refers to ViewCount", + "SQL": "SELECT T1.ViewCount FROM posts AS T1 INNER JOIN postLinks AS T2 ON T1.Id = T2.PostId WHERE T2.PostId = 61217", + "difficulty": "simple" + }, + { + "question_id": 613, + "db_id": "codebase_community", + "question": "List out the dates that users who are located in Rochester, NY obtained their badges?", + "evidence": "\"Rochester, NY\" is the Location of user; user refers to UserId", + "SQL": "SELECT T2.Date FROM users AS T1 INNER JOIN badges AS T2 ON T1.Id = T2.UserId WHERE T1.Location = 'Rochester, NY'", + "difficulty": "simple" + }, + { + "question_id": 617, + "db_id": "codebase_community", + "question": "What is the detailed content of the comment of the post which was created on 7/19/2010 7:37:33 PM?", + "evidence": "detailed content of the comment refers to Text; created on 7/19/2010 7:37:33 PM CreationDate = 2010-07-19 19:37:33.0'", + "SQL": "SELECT T1.Text FROM comments AS T1 INNER JOIN posts AS T2 ON T1.PostId = T2.Id WHERE T1.CreationDate = '2010-07-19 19:37:33.0'", + "difficulty": "simple" + }, + { + "question_id": 619, + "db_id": "codebase_community", + "question": "How many adults who obtained the badge Supporter?", + "evidence": "Supporter is the Name of badge; adult refers to Age BETWEEN 19 AND 65", + "SQL": "SELECT COUNT(T1.Id) FROM users AS T1 INNER JOIN badges AS T2 ON T1.Id = T2.UserId WHERE T2.Name = 'Supporter' AND T1.Age BETWEEN 19 AND 65", + "difficulty": "simple" + }, + { + "question_id": 623, + "db_id": "codebase_community", + "question": "How many elders obtained the \"Supporter\" badge?", + "evidence": "\"Supporter\" is the Name of badge;\u00a0 elders refers to Age > 65", + "SQL": "SELECT COUNT(T1.Id) FROM users AS T1 INNER JOIN badges AS T2 ON T1.Id = T2.UserId WHERE T1.Age > 65 AND T2.Name = 'Supporter'", + "difficulty": "simple" + }, + { + "question_id": 625, + "db_id": "codebase_community", + "question": "How many users were from New York?", + "evidence": "New York refers to Location;", + "SQL": "SELECT COUNT(Id) FROM users WHERE Location = 'New York'", + "difficulty": "simple" + }, + { + "question_id": 631, + "db_id": "codebase_community", + "question": "How many posts were created by Daniel Vassallo?", + "evidence": "DisplayName = 'Daniel Vassallo';", + "SQL": "SELECT COUNT(T1.Id) FROM users AS T1 INNER JOIN postHistory AS T2 ON T1.Id = T2.UserId WHERE T1.DisplayName = 'Daniel Vassallo'", + "difficulty": "simple" + }, + { + "question_id": 653, + "db_id": "codebase_community", + "question": "What is the owner's display name of the most popular post?", + "evidence": "Higher view count means the post has higher popularity; the most popular post refers to MAX(ViewCount);", + "SQL": "SELECT DisplayName FROM users WHERE Id = ( SELECT OwnerUserId FROM posts ORDER BY ViewCount DESC LIMIT 1 )", + "difficulty": "simple" + }, + { + "question_id": 656, + "db_id": "codebase_community", + "question": "Describe the display name of the parent ID for child post with the highest score.", + "evidence": "If the parent id is not null, the post is the child post; the highest score refers to MAX(Score);", + "SQL": "SELECT DisplayName FROM users WHERE Id = ( SELECT OwnerUserId FROM posts WHERE ParentId IS NOT NULL ORDER BY Score DESC LIMIT 1 )", + "difficulty": "simple" + }, + { + "question_id": 657, + "db_id": "codebase_community", + "question": "Under the vote type of 8, provide the display names and websites URLs of the user who got the highest bounty amount.", + "evidence": "vote type of 8 refers to VoteTypeId = 8; the highest bounty amount refers to MAX(BountyAmount);", + "SQL": "SELECT DisplayName, WebsiteUrl FROM users WHERE Id = ( SELECT UserId FROM votes WHERE VoteTypeId = 8 ORDER BY BountyAmount DESC LIMIT 1 )", + "difficulty": "moderate" + }, + { + "question_id": 690, + "db_id": "codebase_community", + "question": "Identify the latest badge awarded to the user with the display name Emmett.", + "evidence": "the latest badge refers to Name FROM badges where MAX(Date);", + "SQL": "SELECT T1.Name FROM badges AS T1 INNER JOIN users AS T2 ON T1.UserId = T2.Id WHERE T2.DisplayName = 'Emmett' ORDER BY T1.Date DESC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 720, + "db_id": "superhero", + "question": "Please list the full names of all the superheroes with over 15 super powers.", + "evidence": "15 super powers refers to COUNT(full_name) > 15", + "SQL": "SELECT DISTINCT T1.full_name FROM superhero AS T1 INNER JOIN hero_power AS T2 ON T1.id = T2.hero_id GROUP BY T1.full_name HAVING COUNT(T2.power_id) > 15", + "difficulty": "simple" + }, + { + "question_id": 726, + "db_id": "superhero", + "question": "Please give the full name of the tallest hero published by Marvel Comics.", + "evidence": "the tallest hero refers to MAX(height_cm); published by Marvel Comics refers to publisher_name = 'Marvel Comics'", + "SQL": "SELECT T1.full_name FROM superhero AS T1 INNER JOIN publisher AS T2 ON T1.publisher_id = T2.id WHERE T2.publisher_name = 'Marvel Comics' ORDER BY T1.height_cm DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 729, + "db_id": "superhero", + "question": "What is the average height of the superheroes from Marvel Comics?", + "evidence": "superheroes from Marvel Comics refers to publisher_name = 'Marvel Comics'; average height of the superheroes refers to AVG(height_cm)", + "SQL": "SELECT AVG(T1.height_cm) FROM superhero AS T1 INNER JOIN publisher AS T2 ON T1.publisher_id = T2.id WHERE T2.publisher_name = 'Marvel Comics'", + "difficulty": "simple" + }, + { + "question_id": 732, + "db_id": "superhero", + "question": "Which publisher published the slowest superhero?", + "evidence": "the slowest superhero refers to attribute_name = 'Speed' where MIN(attribute_value); publisher refers to publisher_name", + "SQL": "SELECT T2.publisher_name FROM superhero AS T1 INNER JOIN publisher AS T2 ON T1.publisher_id = T2.id INNER JOIN hero_attribute AS T3 ON T1.id = T3.hero_id INNER JOIN attribute AS T4 ON T3.attribute_id = T4.id WHERE T4.attribute_name = 'Speed' ORDER BY T3.attribute_value LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 734, + "db_id": "superhero", + "question": "What is the publisher's name of Blue Beetle II?", + "evidence": "Blue Beetle II refers to superhero_name = 'Blue Beetle II'", + "SQL": "SELECT T2.publisher_name FROM superhero AS T1 INNER JOIN publisher AS T2 ON T1.publisher_id = T2.id WHERE T1.superhero_name = 'Blue Beetle II'", + "difficulty": "simple" + }, + { + "question_id": 758, + "db_id": "superhero", + "question": "Provide the hair colour of the human superhero who is 185 cm tall.", + "evidence": "185 cm tall refers to height_cm = 185; human superhero refers to race = 'human'; hair colour refers to colour where hair_colour_id = colour.id;", + "SQL": "SELECT DISTINCT T3.colour FROM superhero AS T1 INNER JOIN race AS T2 ON T1.race_id = T2.id INNER JOIN colour AS T3 ON T1.hair_colour_id = T3.id WHERE T1.height_cm = 185 AND T2.race = 'Human'", + "difficulty": "moderate" + }, + { + "question_id": 760, + "db_id": "superhero", + "question": "In superheroes with height between 150 to 180, what is the percentage of heroes published by Marvel Comics?", + "evidence": "height between 150 to 180 refers to height_cm BETWEEN 150 AND 180; heroes published by Marvel Comics refers to publisher_id = 13; calculation = MULTIPLY(DIVIDE(SUM(publisher.id = 13)), COUNT(publisher.id), 100)", + "SQL": "SELECT CAST(COUNT(CASE WHEN T2.publisher_name = 'Marvel Comics' THEN 1 ELSE NULL END) AS REAL) * 100 / COUNT(T1.id) FROM superhero AS T1 INNER JOIN publisher AS T2 ON T1.publisher_id = T2.id WHERE T1.height_cm BETWEEN 150 AND 180", + "difficulty": "challenging" + }, + { + "question_id": 766, + "db_id": "superhero", + "question": "What is the hero's full name with the highest attribute in strength?", + "evidence": "highest attribute in strength refers to MAX(attribute_value) WHERE attribute_name = 'strength';", + "SQL": "SELECT T1.full_name FROM superhero AS T1 INNER JOIN hero_attribute AS T2 ON T1.id = T2.hero_id INNER JOIN attribute AS T3 ON T2.attribute_id = T3.id WHERE T3.attribute_name = 'Strength' ORDER BY T2.attribute_value DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 772, + "db_id": "superhero", + "question": "List the eyes, hair and skin colour of all female superheroes published by Dark Horse Comics.", + "evidence": "eyes refers to eye_colour_id; hair refers to hair_colour_id; skin colour refers to skin_colour_id; female superheroes refers to gender = 'Female'; published by Dark Horse Comics refers to publisher_name = 'Dark Horse Comics';", + "SQL": "SELECT T1.eye_colour_id, T1.hair_colour_id, T1.skin_colour_id FROM superhero AS T1 INNER JOIN publisher AS T2 ON T2.id = T1.publisher_id INNER JOIN gender AS T3 ON T3.id = T1.gender_id WHERE T2.publisher_name = 'Dark Horse Comics' AND T3.gender = 'Female'", + "difficulty": "challenging" + }, + { + "question_id": 778, + "db_id": "superhero", + "question": "Provide superheroes' names who have the adaptation power.", + "evidence": "adaptation power refers to power_name = 'Adaptation';", + "SQL": "SELECT T1.superhero_name FROM superhero AS T1 INNER JOIN hero_power AS T2 ON T1.id = T2.hero_id INNER JOIN superpower AS T3 ON T2.power_id = T3.id WHERE T3.power_name = 'Adaptation'", + "difficulty": "simple" + }, + { + "question_id": 797, + "db_id": "superhero", + "question": "Which superheroes have blue eyes with brown hair?", + "evidence": "which superheroes refers to superhero_name; blue eyes refers to eye_colour_id = 7; brown hair refers to hair_colour_id = 9;", + "SQL": "SELECT T1.superhero_name FROM superhero AS T1 INNER JOIN colour AS T2 ON T1.eye_colour_id = T2.id INNER JOIN colour AS T3 ON T1.hair_colour_id = T3.id WHERE T2.colour = 'Blue' AND T3.colour = 'Brown'", + "difficulty": "moderate" + }, + { + "question_id": 812, + "db_id": "superhero", + "question": "List down at least five full names of superheroes with blue eyes.", + "evidence": "blue eyes refers to colour.colour = 'Blue' WHERE eye_colour_id = colour.id;", + "SQL": "SELECT T1.superhero_name FROM superhero AS T1 INNER JOIN colour AS T2 ON T1.eye_colour_id = T2.id WHERE T2.colour = 'Blue' LIMIT 5", + "difficulty": "simple" + }, + { + "question_id": 847, + "db_id": "formula_1", + "question": "What is the surname of the driver with the best lap time in race number 19 in the second period?", + "evidence": "race number refers to raceId; second qualifying period refers to q2; best lap time refers to MIN(q2);", + "SQL": "SELECT T2.surname FROM qualifying AS T1 INNER JOIN drivers AS T2 ON T2.driverId = T1.driverId WHERE T1.raceId = 19 AND T1.q2 IS NOT NULL ORDER BY T1.q2 ASC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 857, + "db_id": "formula_1", + "question": "Give the coordinate position for Abu Dhabi Grand Prix.", + "evidence": "coordinates refers to (lat, lng); position and location shares the same meaning.", + "SQL": "SELECT DISTINCT T1.lat, T1.lng, T1.location FROM circuits AS T1 INNER JOIN races AS T2 ON T2.circuitID = T1.circuitId WHERE T2.name = 'Abu Dhabi Grand Prix'", + "difficulty": "simple" + }, + { + "question_id": 860, + "db_id": "formula_1", + "question": "For the driver who had the Q2 time as 0:01:40 in the qualifying race No. 355, what is his nationality?", + "evidence": "race number refers to raceId;", + "SQL": "SELECT DISTINCT T2.nationality FROM qualifying AS T1 INNER JOIN drivers AS T2 ON T2.driverId = T1.driverId WHERE T1.raceId = 355 AND T1.q2 LIKE '1:40%'", + "difficulty": "simple" + }, + { + "question_id": 864, + "db_id": "formula_1", + "question": "For the race happened on 2015/11/29, how many drivers finished the game?", + "evidence": "game and race are synonyms; drivers who finished the race should have record in time;", + "SQL": "SELECT COUNT(T2.driverId) FROM races AS T1 INNER JOIN results AS T2 ON T2.raceId = T1.raceId WHERE T1.date = '2015-11-29' AND T2.time IS NOT NULL", + "difficulty": "simple" + }, + { + "question_id": 882, + "db_id": "formula_1", + "question": "Which year was the first Singapore Grand Prix?", + "evidence": "the first race refers to race happened in min(year);", + "SQL": "SELECT year FROM races WHERE name = 'Singapore Grand Prix' ORDER BY year ASC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 896, + "db_id": "formula_1", + "question": "Calculate the percentage whereby Hamilton was not at the 1st track of the the f1 circuit since 2010.", + "evidence": "DIVIDE(COUNT(raceId) where surname = 'Hamilton', year >= 2010 and position>1), (COUNT(raceId) where surname = 'Hamilton', year >= 2010) as percentage;", + "SQL": "SELECT CAST(COUNT(CASE WHEN T2.position <> 1 THEN T2.position END) AS REAL) * 100 / COUNT(T2.driverStandingsId) FROM races AS T1 INNER JOIN driverStandings AS T2 ON T2.raceId = T1.raceId INNER JOIN drivers AS T3 ON T3.driverId = T2.driverId WHERE T3.surname = 'Hamilton' AND T1.year >= 2010", + "difficulty": "challenging" + }, + { + "question_id": 929, + "db_id": "formula_1", + "question": "Please list the Formula_1 races that Lewis Hamilton participated.", + "evidence": "", + "SQL": "SELECT T1.name FROM races AS T1 INNER JOIN results AS T2 ON T2.raceId = T1.raceId INNER JOIN drivers AS T3 ON T3.driverId = T2.driverId WHERE T3.forename = 'Lewis' AND T3.surname = 'Hamilton'", + "difficulty": "simple" + }, + { + "question_id": 937, + "db_id": "formula_1", + "question": "What's the finish time for the driver who ranked second in 2008's Australian Grand Prix?", + "evidence": "finish time refers to time", + "SQL": "SELECT T1.time FROM results AS T1 INNER JOIN races AS T2 on T1.raceId = T2.raceId WHERE T1.rank = 2 AND T2.name = 'Australian Grand Prix' AND T2.year = 2008", + "difficulty": "simple" + }, + { + "question_id": 939, + "db_id": "formula_1", + "question": "How many drivers from the USA participated in the 2008 Australian Grand Prix?", + "evidence": "from the USA refers to nationality = 'American'", + "SQL": "SELECT COUNT(*) FROM drivers AS T1 INNER JOIN results AS T2 ON T1.driverId = T2.driverId INNER JOIN races AS T3 ON T3.raceId = T2.raceId WHERE T3.name = 'Australian GrAND Prix' AND T1.nationality = 'American' AND T3.year = 2008", + "difficulty": "moderate" + }, + { + "question_id": 944, + "db_id": "formula_1", + "question": "How much faster in percentage is the champion than the driver who finished the race last in the 2008 Australian Grand Prix?", + "evidence": "how much faster in percentage = divide(subtract(incremental time, champion time), last_driver time) * 100%; last driver finished time = incremental time + champion time; only champion's finished time is represented by 'HH:MM:SS.mmm'; finished the game refers to time is not null", + "SQL": "WITH time_in_seconds AS ( SELECT T1.positionOrder, CASE WHEN T1.positionOrder = 1 THEN (CAST(SUBSTR(T1.time, 1, 1) AS REAL) * 3600) + (CAST(SUBSTR(T1.time, 3, 2) AS REAL) * 60) + CAST(SUBSTR(T1.time, 6) AS REAL) ELSE CAST(SUBSTR(T1.time, 2) AS REAL) END AS time_seconds FROM results AS T1 INNER JOIN races AS T2 ON T1.raceId = T2.raceId WHERE T2.name = 'Australian Grand Prix' AND T1.time IS NOT NULL AND T2.year = 2008 ), champion_time AS ( SELECT time_seconds FROM time_in_seconds WHERE positionOrder = 1), last_driver_incremental AS ( SELECT time_seconds FROM time_in_seconds WHERE positionOrder = (SELECT MAX(positionOrder) FROM time_in_seconds) ) SELECT (CAST((SELECT time_seconds FROM last_driver_incremental) AS REAL) * 100) / (SELECT time_seconds + (SELECT time_seconds FROM last_driver_incremental) FROM champion_time)", + "difficulty": "challenging" + }, + { + "question_id": 956, + "db_id": "formula_1", + "question": "Which drivers born after 1975 have been ranked 2? Please give their forenames and surnames.", + "evidence": "born after 1975 refers to year(dob) >1975;", + "SQL": "SELECT T2.forename, T2.surname FROM results AS T1 INNER JOIN drivers AS T2 on T1.driverId = T2.driverId WHERE STRFTIME('%Y', T2.dob) > '1975' AND T1.rank = 2", + "difficulty": "simple" + }, + { + "question_id": 971, + "db_id": "formula_1", + "question": "Please state the reference name of the oldest German driver.", + "evidence": "oldest refers to MIN(year(dob)); reference names appear in drverRef.", + "SQL": "SELECT driverRef FROM drivers WHERE nationality = 'German' ORDER BY JULIANDAY(dob) ASC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 972, + "db_id": "formula_1", + "question": "Which drivers who were born in 1971 and has the fastest lap time on the race? Give id and code of these drivers.", + "evidence": "born in 1971 refers to year(dob) = 1971; has the fastest lap time refers to fastestLapTime has values", + "SQL": "SELECT T2.driverId, T2.code FROM results AS T1 INNER JOIN drivers AS T2 on T1.driverId = T2.driverId WHERE STRFTIME('%Y', T2.dob) = '1971' AND T1.fastestLapTime IS NOT NULL", + "difficulty": "moderate" + }, + { + "question_id": 996, + "db_id": "formula_1", + "question": "What is the annual average number of races held during the first 10 years of the 21st century?", + "evidence": "races in date between '2000-01-01' and '2010-12-31'", + "SQL": "SELECT CAST(SUM(CASE WHEN year BETWEEN 2000 AND 2010 THEN 1 ELSE 0 END) AS REAL) / 10 FROM races WHERE date BETWEEN '2000-01-01' AND '2010-12-31'", + "difficulty": "simple" + }, + { + "question_id": 997, + "db_id": "formula_1", + "question": "Which citizenship do the vast majority of the drivers hold?", + "evidence": "Citizenship of majority of drivers = MAX(nationality); citizenship and nationality are synonyms\n\n", + "SQL": "SELECT nationality FROM drivers GROUP BY nationality ORDER BY COUNT(driverId) DESC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 1003, + "db_id": "formula_1", + "question": "How many accidents did the driver who had the highest number accidents in the Canadian Grand Prix have?", + "evidence": "number of accidents refers to the number where statusid = 3; Canadian Grand Prix refers to the race of name\n", + "SQL": "SELECT COUNT(T1.driverId) FROM results AS T1 INNER JOIN races AS T2 on T1.raceId = T2.raceId INNER JOIN status AS T3 on T1.statusId = T3.statusId WHERE T3.statusId = 3 AND T2.name = 'Canadian Grand Prix' GROUP BY T1.driverId ORDER BY COUNT(T1.driverId) DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1013, + "db_id": "formula_1", + "question": "What is the lap record for the Austrian Grand Prix Circuit?", + "evidence": "lap record means the fastest time recorded which refers to time", + "SQL": "WITH fastest_lap_times AS ( SELECT T1.raceId, T1.fastestLapTime FROM results AS T1 WHERE T1.FastestLapTime IS NOT NULL) SELECT MIN(fastest_lap_times.fastestLapTime) as lap_record FROM fastest_lap_times INNER JOIN races AS T2 on fastest_lap_times.raceId = T2.raceId INNER JOIN circuits AS T3 on T2.circuitId = T3.circuitId WHERE T2.name = 'Austrian Grand Prix'", + "difficulty": "simple" + }, + { + "question_id": 1032, + "db_id": "european_football_2", + "question": "Give the name of the league with the highest matches of all time and how many matches were played in the said league.", + "evidence": "name of the league refers to League.name; league with highest matches of all time refers to MAX(COUNT(league_id));", + "SQL": "SELECT t2.name, COUNT(t1.id) FROM Match AS t1 INNER JOIN League AS t2 ON t1.league_id = t2.id GROUP BY t2.name ORDER BY COUNT(t1.id) DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1035, + "db_id": "european_football_2", + "question": "Give the team_fifa_api_id of teams with more than 50 but less than 60 build-up play speed.", + "evidence": "teams with more than 50 but less than 60 build-up play speed refers to buildUpPlaySpeed BETWEEN 51 AND 59;", + "SQL": "SELECT DISTINCT team_fifa_api_id FROM Team_Attributes WHERE buildUpPlaySpeed > 50 AND buildUpPlaySpeed < 60", + "difficulty": "simple" + }, + { + "question_id": 1036, + "db_id": "european_football_2", + "question": "List the long name of teams with above-average build-up play passing in 2012.", + "evidence": "long name of teams refers to team_long_name; build-up play passing refers to buildUpPlayPassing; above-average build-up play passing = DIVIDE(SUM(buildUpPlayPassing), COUNT(team_long_name) WHERE buildUpPlayPassing IS NOT NULL) < buildUpPlayPassing; in 2012 refers to strftime('%Y', date) = '2012';", + "SQL": "SELECT DISTINCT t4.team_long_name FROM Team_Attributes AS t3 INNER JOIN Team AS t4 ON t3.team_api_id = t4.team_api_id WHERE SUBSTR(t3.`date`, 1, 4) = '2012' AND t3.buildUpPlayPassing > ( SELECT CAST(SUM(t2.buildUpPlayPassing) AS REAL) / COUNT(t1.id) FROM Team AS t1 INNER JOIN Team_Attributes AS t2 ON t1.team_api_id = t2.team_api_id WHERE SUBSTR(t2.`date`, 1, 4) = '2012' )", + "difficulty": "challenging" + }, + { + "question_id": 1040, + "db_id": "european_football_2", + "question": "List the top 10 players' names whose heights are above 180 in descending order of average heading accuracy.", + "evidence": "height > 180; average heading accuracy = DIVIDE(SUM(heading_accuracy), COUNT(player_fifa_api_id));", + "SQL": "SELECT t1.player_name FROM Player AS t1 INNER JOIN Player_Attributes AS t2 ON t1.player_api_id = t2.player_api_id WHERE t1.height > 180 GROUP BY t1.id ORDER BY CAST(SUM(t2.heading_accuracy) AS REAL) / COUNT(t2.`player_fifa_api_id`) DESC LIMIT 10", + "difficulty": "moderate" + }, + { + "question_id": 1047, + "db_id": "european_football_2", + "question": "What is the football player Francois Affolter header's finishing rate on 18/09/2014?", + "evidence": "header's finishing rate refers to heading_accuracy; on 18/09/2014 refers to date = '2014-09-18 00:00:00';", + "SQL": "SELECT t2.heading_accuracy FROM Player AS t1 INNER JOIN Player_Attributes AS t2 ON t1.player_api_id = t2.player_api_id WHERE t1.player_name = 'Francois Affolter' AND SUBSTR(t2.`date`, 1, 10) = '2014-09-18'", + "difficulty": "moderate" + }, + { + "question_id": 1063, + "db_id": "european_football_2", + "question": "What is Aaron Doran's potential score?", + "evidence": "potential score refers to potential;", + "SQL": "SELECT t2.potential FROM Player AS t1 INNER JOIN Player_Attributes AS t2 ON t1.player_api_id = t2.player_api_id WHERE t1.player_name = 'Aaron Doran'", + "difficulty": "simple" + }, + { + "question_id": 1082, + "db_id": "european_football_2", + "question": "Please list the leagues from Germany.", + "evidence": "Germany refers to Country.name = 'Germany';", + "SQL": "SELECT t2.name FROM Country AS t1 INNER JOIN League AS t2 ON t1.id = t2.country_id WHERE t1.name = 'Germany'", + "difficulty": "simple" + }, + { + "question_id": 1085, + "db_id": "european_football_2", + "question": "Which of these players performs the best in crossing actions, Alexis, Ariel Borysiuk or Arouna Kone?", + "evidence": "player who perform best in crossing actions refers to MAX(crossing);", + "SQL": "SELECT t1.player_name, t2.crossing FROM Player AS t1 INNER JOIN Player_Attributes AS t2 ON t1.player_api_id = t2.player_api_id WHERE t1.player_name IN ('Alexis', 'Ariel Borysiuk', 'Arouna Kone') ORDER BY t2.crossing DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1087, + "db_id": "european_football_2", + "question": "Among the players whose height is over 180, how many of them have a volley score of over 70?", + "evidence": "height > 180; volley score refers to volleys; volleys > 70;", + "SQL": "SELECT COUNT(DISTINCT t1.id) FROM Player AS t1 INNER JOIN Player_Attributes AS t2 ON t1.player_api_id = t2.player_api_id WHERE t1.height > 180 AND t2.volleys > 70", + "difficulty": "simple" + }, + { + "question_id": 1098, + "db_id": "european_football_2", + "question": "What is Ajax's highest chance creation passing score and what is it classified as?", + "evidence": "Ajax's refers to team_long_name = 'Ajax'; chance creation passing score refers to MAX(chanceCreationPassing); classified as chanceCreationPassingClass", + "SQL": "SELECT t2.chanceCreationPassing, t2.chanceCreationPassingClass FROM Team AS t1 INNER JOIN Team_Attributes AS t2 ON t1.team_api_id = t2.team_api_id WHERE t1.team_long_name = 'Ajax' ORDER BY t2.chanceCreationPassing DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1104, + "db_id": "european_football_2", + "question": "What was the potiential for Francesco Parravicini on 2010/8/30?", + "evidence": "Francesco Parravicini refers to player_name = 'Francesco Parravicini'; on 2010/8/30 refers to date = '2010-08-30 00:00:00'", + "SQL": "SELECT t2.potential FROM Player AS t1 INNER JOIN Player_Attributes AS t2 ON t1.player_api_id = t2.player_api_id WHERE SUBSTR(t2.`date`, 1, 10) = '2010-08-30' AND t1.player_name = 'Francesco Parravicini'", + "difficulty": "moderate" + }, + { + "question_id": 1124, + "db_id": "european_football_2", + "question": "Who are the players that tend to be attacking when their mates were doing attack moves? List down their name.", + "evidence": "tend to be attacking when their mates were doing attack moves refers to attacking_work_rate = 'high';", + "SQL": "SELECT DISTINCT t1.player_name FROM Player AS t1 INNER JOIN Player_Attributes AS t2 ON t1.player_api_id = t2.player_api_id WHERE t2.attacking_work_rate = 'high'", + "difficulty": "moderate" + }, + { + "question_id": 1158, + "db_id": "thrombosis_prediction", + "question": "List all patients who were born in 1937 whose total cholesterol was beyond the normal range.", + "evidence": "who were born in 1937 refers to year(birthday) = '1937'; total cholesterol was beyond the normal range refers to `T-CHO` > = '250'", + "SQL": "SELECT DISTINCT T1.ID FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE STRFTIME('%Y', T1.Birthday) = '1937' AND T2.`T-CHO` >= 250", + "difficulty": "moderate" + }, + { + "question_id": 1161, + "db_id": "thrombosis_prediction", + "question": "For in-patient age 50 and above, what is their average anti-cardiolipin antibody (IgG) concentration?", + "evidence": "in-patient refers to Admission = '+'; age 50 and above refers to SUBTRACT(year(current_timestamp), year(Birthday)) >= '50'; average anti-cardiolipin antibody (IgG) concentration refers to AVG(aCL IgG)", + "SQL": "SELECT AVG(T2.`aCL IgG`) FROM Patient AS T1 INNER JOIN Examination AS T2 ON T1.ID = T2.ID WHERE STRFTIME('%Y', CURRENT_TIMESTAMP) - STRFTIME('%Y', T1.Birthday) >= 50 AND T1.Admission = '+'", + "difficulty": "challenging" + }, + { + "question_id": 1182, + "db_id": "thrombosis_prediction", + "question": "For the patient who first came to the hospital on 1991/6/13 who was diagnosed with SJS, what is the total number of his/her Laboratory tests in 1995?", + "evidence": "1991/6/13 refers to `First Date` = '1991-06-13'; 'SJS' refers to Diagnosis; total number of his/her Laboratory tests refers to COUNT(ID); 1995 refers to Date", + "SQL": "SELECT COUNT(*) FROM Laboratory WHERE ID = ( SELECT ID FROM Patient WHERE `First Date` = '1991-06-13' AND Diagnosis = 'SJS' ) AND STRFTIME('%Y', Date) = '1995'", + "difficulty": "moderate" + }, + { + "question_id": 1195, + "db_id": "thrombosis_prediction", + "question": "What is the average blood albumin level for female patients with a PLT greater than 400 who have been diagnosed with SLE?", + "evidence": "average blood albumin level refers to AVG(ALB); female refers to SEX = 'F'; PLT greater than 400 refers to PLT > 400; 'SLE' refers to diagnosis", + "SQL": "SELECT AVG(T2.ALB) FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T2.PLT > 400 AND T1.Diagnosis = 'SLE' AND T1.SEX = 'F'", + "difficulty": "moderate" + }, + { + "question_id": 1201, + "db_id": "thrombosis_prediction", + "question": "What percentage of patients who were born in 1980 and were diagnosed with RA are women?", + "evidence": "born in 1980 refers to YEAR(BIRTHDAY) = '1980'; 'RA' refers to diagnosis; women refers to SEX = 'F'; calculation = DIVIDE((SEX = 'F'), COUNT(SEX)) where YEAR(BIRTHDAY) = '1980' AND diagnosis = 'RA' MULTIPLY 100", + "SQL": "SELECT CAST(SUM(CASE WHEN SEX = 'F' THEN 1 ELSE 0 END) AS REAL) * 100 / COUNT(ID) FROM Patient WHERE Diagnosis = 'RA' AND STRFTIME('%Y', Birthday) = '1980'", + "difficulty": "moderate" + }, + { + "question_id": 1213, + "db_id": "thrombosis_prediction", + "question": "Name the ID of the patient who is born on the April 1st, 1982. Is his/her alkaliphophatase (ALP) within normal range?", + "evidence": "alkaliphophatase (ALP) within normal range refers to ALP < 300", + "SQL": "SELECT T1.ID , CASE WHEN T2.ALP < 300 THEN 'normal' ELSE 'abNormal' END FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T1.Birthday = '1982-04-01'", + "difficulty": "moderate" + }, + { + "question_id": 1217, + "db_id": "thrombosis_prediction", + "question": "For all patient born in 1982, state if their albumin is within normal range.", + "evidence": "Year(Birthday) = '1982'; albumin is within normal range refers to ALB between 3.5 and 5.5", + "SQL": "SELECT CASE WHEN T2.ALB >= 3.5 AND T2.ALB <= 5.5 THEN 'normal' ELSE 'abnormal' END FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE STRFTIME('%Y', T1.Birthday) = '1982'", + "difficulty": "moderate" + }, + { + "question_id": 1224, + "db_id": "thrombosis_prediction", + "question": "What is the highest total bilirubin level recorded? List out the patient details with ID, sex and birthday with that index.", + "evidence": "the highest total bilirubin refers to MAX(T-BIL)", + "SQL": "SELECT T2.`T-BIL`, T1.ID, T1.SEX, T1.Birthday FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID ORDER BY T2.`T-BIL` DESC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 1247, + "db_id": "thrombosis_prediction", + "question": "Among the male patients who have a normal level of white blood cells, how many of them have an abnormal fibrinogen level?", + "evidence": "male patients refers to Sex = 'M'; normal level of white blood cells refers to WBC between 3.5 and 9.0; abnormal fibrinogen level refers to FG < = 150 or FG > = 450; Don't compute repetitive ones.", + "SQL": "SELECT COUNT(DISTINCT T1.ID) FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T2.FG <= 150 OR T2.FG >= 450 AND T2.WBC > 3.5 AND T2.WBC < 9.0 AND T1.SEX = 'M'", + "difficulty": "challenging" + }, + { + "question_id": 1249, + "db_id": "thrombosis_prediction", + "question": "Please list the disease names of the patients that have a proteinuria level higher than normal.", + "evidence": "disease names refers to Diagnosis; proteinuria level higher than normal refers to `U-PRO` > = 30;", + "SQL": "SELECT T1.Diagnosis FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T2.`U-PRO` >= 30", + "difficulty": "simple" + }, + { + "question_id": 1266, + "db_id": "thrombosis_prediction", + "question": "Which is the youngest patient with an abnormal anti-ribonuclear protein level? Please list his or her date of birth.", + "evidence": "youngest patient refers to MAX(Birthday); abnormal anti-ribonuclear protein level refers to RNP NOT IN('-', '+-'); date of birth refers to Birthday;", + "SQL": "SELECT T1.Birthday FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T2.RNP != '-' OR '+-' ORDER BY T1.Birthday DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1268, + "db_id": "thrombosis_prediction", + "question": "For the patients with an abnormal anti-SM, please list the IDs of the three youngest ones.", + "evidence": "abnormal anti-SM refers to SM NOT IN('-', '+-'); youngest refers to MAX(Birthday);", + "SQL": "SELECT T1.ID FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T2.SM NOT IN ('negative','0') ORDER BY T1.Birthday DESC LIMIT 3", + "difficulty": "simple" + }, + { + "question_id": 1272, + "db_id": "thrombosis_prediction", + "question": "Which patient is the first patient with an abnormal anti-SSA to come to the hospital? Please give his or her ID.", + "evidence": "first patient refers to ID with MIN(`First Date`); abnormal anti-SSA refers to SSA NOT IN('-', '+-');", + "SQL": "SELECT T1.ID FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T1.`First Date` IS NOT NULL AND T2.SSA NOT IN ('negative', '0') ORDER BY T1.`First Date` ASC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1278, + "db_id": "thrombosis_prediction", + "question": "Of the patients with an abnormal level of anti-DNA-II, how many of them admitted to the hospital?", + "evidence": "normal level of anti-DNA-II refers to DNA-II < 8; admitted to the hospital refers to Admission = '+';", + "SQL": "SELECT COUNT(T1.ID) FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T2.`DNA-II` >= 8 AND T1.Admission = '+'", + "difficulty": "simple" + }, + { + "question_id": 1291, + "db_id": "thrombosis_prediction", + "question": "How many male patients have a normal level of both albumin and total protein?", + "evidence": "male refers to Sex = 'M'; normal level of both albumin and total protein refers to ALB > 3.5 and ALB < 5.5 AND TP between 6.0 and 8.5;", + "SQL": "SELECT COUNT(T1.ID) FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE T1.SEX = 'M' AND T2.ALB BETWEEN 3.5 AND 5.5 AND T2.TP BETWEEN 6.0 AND 8.5", + "difficulty": "moderate" + }, + { + "question_id": 1293, + "db_id": "thrombosis_prediction", + "question": "What is the highest anti-nucleus antibody concentration level of a patient with a normal creatinine level?", + "evidence": "highest anti-nucleus antibody concentration level refers to MAX(ANA); normal creatinine level refers to CRE < 1.5;", + "SQL": "SELECT T2.ANA FROM Patient AS T1 INNER JOIN Examination AS T2 ON T1.ID = T2.ID INNER JOIN Laboratory AS T3 ON T1.ID = T3.ID WHERE T3.CRE < 1.5 ORDER BY T2.ANA DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1319, + "db_id": "student_club", + "question": "Which college is the vice president of the Student_Club from?", + "evidence": "Vice President is a position of the Student Club", + "SQL": "SELECT T2.college FROM member AS T1 INNER JOIN major AS T2 ON T1.link_to_major = T2.major_id WHERE T1.position LIKE 'vice president'", + "difficulty": "simple" + }, + { + "question_id": 1343, + "db_id": "student_club", + "question": "With the biggest budget for the \"Food\", what was the remaining of it?", + "evidence": "remaining of budget refers to remaining, biggest budget for 'Food' refers to MAX(budget.amount) where category = 'Food'", + "SQL": "SELECT remaining FROM budget WHERE category = 'Food' AND amount = ( SELECT MAX(amount) FROM budget WHERE category = 'Food' )", + "difficulty": "simple" + }, + { + "question_id": 1349, + "db_id": "student_club", + "question": "Provide the total number of the budget amount for \"September Speaker\" event.", + "evidence": "'September Speaker' is an event name; total number of budget amount refers to SUM(amount)", + "SQL": "SELECT SUM(T1.amount) FROM budget AS T1 INNER JOIN event AS T2 ON T1.link_to_event = T2.event_id WHERE T2.event_name = 'September Speaker'", + "difficulty": "simple" + }, + { + "question_id": 1366, + "db_id": "student_club", + "question": "List all the members who attended the event \"October Meeting\".", + "evidence": "'October Meeting' is an event name;", + "SQL": "SELECT DISTINCT T3.member_id FROM event AS T1 INNER JOIN attendance AS T2 ON T1.event_id = T2.link_to_event INNER JOIN member AS T3 ON T2.link_to_member = T3.member_id WHERE T1.event_name = 'October Meeting'", + "difficulty": "simple" + }, + { + "question_id": 1368, + "db_id": "student_club", + "question": "What does the person with the phone number \"809-555-3360\" major in?", + "evidence": "major in refers to major_name", + "SQL": "SELECT T2.major_name FROM member AS T1 INNER JOIN major AS T2 ON T1.link_to_major = T2.major_id WHERE T1.phone = '809-555-3360'", + "difficulty": "simple" + }, + { + "question_id": 1393, + "db_id": "student_club", + "question": "Provide the full name and email address of the Student_Club's Secretary.", + "evidence": "full name refers to first_name, last_name; 'Secretary' is a position of Student Club", + "SQL": "SELECT first_name, last_name, email FROM member WHERE position = 'Secretary'", + "difficulty": "simple" + }, + { + "question_id": 1411, + "db_id": "student_club", + "question": "State what kind of expenses that Sacha Harrison incurred?", + "evidence": "kind of expenses refers to expense_description", + "SQL": "SELECT T2.expense_description FROM member AS T1 INNER JOIN expense AS T2 ON T1.member_id = T2.link_to_member WHERE T1.first_name = 'Sacha' AND T1.last_name = 'Harrison'", + "difficulty": "simple" + }, + { + "question_id": 1415, + "db_id": "student_club", + "question": "List out the position of members who joined major of Business.", + "evidence": "'Business' is the major name", + "SQL": "SELECT T2.position FROM major AS T1 INNER JOIN member AS T2 ON T1.major_id = T2.link_to_major WHERE T1.major_name = 'Business'", + "difficulty": "simple" + }, + { + "question_id": 1430, + "db_id": "student_club", + "question": "What is the last name and position of the student that bought pizza on 09/10/2019?", + "evidence": "bought pizza on 09/10/2019 refers to expense_description = 'Pizza' where expense_date = '2019-09-10'", + "SQL": "SELECT T1.last_name, T1.position FROM member AS T1 INNER JOIN expense AS T2 ON T1.member_id = T2.link_to_member WHERE T2.expense_date = '2019-09-10' AND T2.expense_description = 'Pizza'", + "difficulty": "moderate" + }, + { + "question_id": 1438, + "db_id": "student_club", + "question": "Please indicate the college of the person whose first name is Katy with the link to the major \"rec1N0upiVLy5esTO\".", + "evidence": "", + "SQL": "SELECT T2.college FROM member AS T1 INNER JOIN major AS T2 ON T2.major_id = T1.link_to_major WHERE T1.link_to_major = 'rec1N0upiVLy5esTO' AND T1.first_name = 'Katy'", + "difficulty": "simple" + }, + { + "question_id": 1447, + "db_id": "student_club", + "question": "List the name and location of events that underspend its budget.", + "evidence": "name of event refers to event_name; underspend its budget refers to remaining > 0", + "SQL": "SELECT DISTINCT T1.event_name, T1.location FROM event AS T1 INNER JOIN budget AS T2 ON T1.event_id = T2.link_to_event WHERE T2.remaining > 0", + "difficulty": "simple" + }, + { + "question_id": 1453, + "db_id": "student_club", + "question": "List the name of events with less than average parking cost.", + "evidence": "name of events refers to event_name; less than average parking cost refers to cost < DIVIDE(SUM(cost), COUNT(event_id)) where category = 'Parking'", + "SQL": "SELECT T1.event_name FROM event AS T1 INNER JOIN budget AS T2 ON T1.event_id = T2.link_to_event INNER JOIN expense AS T3 ON T2.budget_id = T3.link_to_budget WHERE T2.category = 'Parking' AND T3.cost < (SELECT AVG(cost) FROM expense)", + "difficulty": "moderate" + }, + { + "question_id": 1454, + "db_id": "student_club", + "question": "What is the percentage of the cost for the game events?", + "evidence": "game events refers to type = 'Game'; percentage = DIVIDE( SUM(cost), COUNT(event_id)) * 100", + "SQL": "SELECT SUM(CASE WHEN T1.type = 'Game' THEN T3.cost ELSE 0 END) * 100 / SUM(T3.cost) FROM event AS T1 INNER JOIN budget AS T2 ON T1.event_id = T2.link_to_event INNER JOIN expense AS T3 ON T2.budget_id = T3.link_to_budget", + "difficulty": "moderate" + }, + { + "question_id": 1466, + "db_id": "student_club", + "question": "Write the full name of the club member with the position of 'Secretary' and list which college the club member belongs to.", + "evidence": "full name refers to first_name, last name", + "SQL": "SELECT T1.first_name, T1.last_name, college FROM member AS T1 INNER JOIN major AS T2 ON T2.major_id = T1.link_to_major WHERE T1.position = 'Secretary'", + "difficulty": "simple" + }, + { + "question_id": 1467, + "db_id": "student_club", + "question": "Calculate the total amount spent on speaker gifts and list the name of the event they were spent on.", + "evidence": "total amount spent = SUM(spent) where category = 'Speaker Gifts'", + "SQL": "SELECT SUM(T1.spent), T2.event_name FROM budget AS T1 INNER JOIN event AS T2 ON T1.link_to_event = T2.event_id WHERE T1.category = 'Speaker Gifts'", + "difficulty": "simple" + }, + { + "question_id": 1472, + "db_id": "debit_card_specializing", + "question": "In 2012, who had the least consumption in LAM?", + "evidence": "Year 2012 can be presented as Between 201201 And 201212, which means between January and December in 2012", + "SQL": "SELECT T1.CustomerID FROM customers AS T1 INNER JOIN yearmonth AS T2 ON T1.CustomerID = T2.CustomerID WHERE T1.Segment = 'LAM' AND T2.date BETWEEN 201201 AND 201212 GROUP BY T1.CustomerID ORDER BY SUM(T2.Consumption) ASC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1479, + "db_id": "debit_card_specializing", + "question": "Which year recorded the most consumption of gas paid in CZK?", + "evidence": "The first 4 strings of the values in the table yearmonth can represent year.", + "SQL": "SELECT SUBSTRING(T2.Date, 1, 4) FROM customers AS T1 INNER JOIN yearmonth AS T2 ON T1.CustomerID = T2.CustomerID WHERE T1.Currency = 'CZK' GROUP BY SUBSTRING(T2.Date, 1, 4) ORDER BY SUM(T2.Consumption) DESC LIMIT 1", + "difficulty": "moderate" + }, + { + "question_id": 1485, + "db_id": "debit_card_specializing", + "question": "How much more was customer 7 consuming in April 2013 than customer 5?", + "evidence": "April 2013 refers to 201304 in the yearmonth.date", + "SQL": "SELECT SUM(IIF(CustomerID = 7, Consumption, 0)) - SUM(IIF(CustomerID = 5, Consumption, 0)) FROM yearmonth WHERE Date = '201304'", + "difficulty": "simple" + }, + { + "question_id": 1511, + "db_id": "debit_card_specializing", + "question": "For the customers who paid in the euro, what is their average total price of the transactions?", + "evidence": "", + "SQL": "SELECT AVG(T1.Price) FROM transactions_1k AS T1 INNER JOIN gasstations AS T2 ON T1.GasStationID = T2.GasStationID INNER JOIN customers AS T3 ON T1.CustomerID = T3.CustomerID WHERE T3.Currency = 'EUR'", + "difficulty": "simple" + }, + { + "question_id": 1527, + "db_id": "debit_card_specializing", + "question": "Which gas station has the highest amount of revenue?", + "evidence": "", + "SQL": "SELECT GasStationID FROM transactions_1k GROUP BY GasStationID ORDER BY SUM(Price) DESC LIMIT 1", + "difficulty": "simple" + }, + { + "question_id": 1533, + "db_id": "debit_card_specializing", + "question": "For all the people who paid more than 29.00 per unit of product id No.5. Give their consumption status in the August of 2012.", + "evidence": "August of 2012 means Date contains '201208' in the yearmonth.date of the database; Price per unit of product = Price / Amount", + "SQL": "SELECT T2.Consumption FROM transactions_1k AS T1 INNER JOIN yearmonth AS T2 ON T1.CustomerID = T2.CustomerID WHERE T1.Price / T1.Amount > 29.00 AND T1.ProductID = 5 AND T2.Date = '201208'", + "difficulty": "moderate" + } +] \ No newline at end of file -- GitLab From 240c74d3bd123efd1dce101e2ff0c790d137d4eb Mon Sep 17 00:00:00 2001 From: stepmitt <st3pmitt@gmail.com> Date: Mon, 3 Mar 2025 14:41:28 +0100 Subject: [PATCH 07/12] implement sampling count --- .../run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml | 1 + .../candidate_generator/tool_kit/revise.py | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml b/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml index 7d045b2..e12e34b 100644 --- a/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml +++ b/CHESS/run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml @@ -60,3 +60,4 @@ team_agents: engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct' temperature: 0.0 parser_name: 'revise' + sampling_count: 3 diff --git a/CHESS/src/workflow/agents/candidate_generator/tool_kit/revise.py b/CHESS/src/workflow/agents/candidate_generator/tool_kit/revise.py index b8bd115..ce11f7e 100644 --- a/CHESS/src/workflow/agents/candidate_generator/tool_kit/revise.py +++ b/CHESS/src/workflow/agents/candidate_generator/tool_kit/revise.py @@ -7,17 +7,19 @@ from database_utils.execution import ExecutionStatus from workflow.system_state import SystemState from workflow.sql_meta_info import SQLMetaInfo from workflow.agents.tool import Tool +from runner.database_manager import DatabaseManager class Revise(Tool): """ Tool for correcting a SQL query that returns empty set or has a syntax error. """ - def __init__(self, template_name: str = None, engine_config: str = None, parser_name: str = None): + def __init__(self, template_name: str = None, engine_config: str = None, parser_name: str = None, sampling_count: int = 1): super().__init__() self.template_name = template_name self.engine_config = engine_config self.parser_name = parser_name + self.sampling_count = sampling_count def _run(self, state: SystemState): @@ -68,22 +70,34 @@ class Revise(Tool): engine=get_llm_chain(**self.engine_config), parser=get_parser(self.parser_name), request_list=request_list, - step=self.tool_name + step=self.tool_name, + sampling_count=self.sampling_count # CHANGED ) response = [r[0] for r in response] except Exception as e: print(f"Error in Checker while getting response: {e}") response = [] + index = 0 + revised_sqls = [res["revised_SQL"] for res in response] + # For all candidates, bei uns nur N=1 for target_SQL_meta_info in target_SQL_meta_infos: try: + # Wenn der candidate eine revision braucht... if target_SQL_meta_info.need_fixing: refinement_response = response[index] index += 1 + + revised_sql = DatabaseManager().aggregate_sqls(sqls=revised_sqls) + chosen_res = next(res for res in response if res["revised_SQL"] == revised_sql) + refinement_response = { "refined_sql_query": chosen_res["revised_SQL"] } + + ''' if "SELECT" not in refinement_response["refined_sql_query"]: refinement_response = { "refined_sql_query": target_SQL_meta_info.SQL } + ''' else: refinement_response = { "refined_sql_query": target_SQL_meta_info.SQL @@ -93,6 +107,7 @@ class Revise(Tool): refinement_response = { "refined_sql_query": target_SQL_meta_info.SQL } + if "refined_sql_query" in refinement_response: if refinement_response["refined_sql_query"]: state.SQL_meta_infos[SQL_id].append(SQLMetaInfo(**{ -- GitLab From 7d99559aed45de4a5b51270a5f050be1f27783ce Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Wed, 5 Mar 2025 08:36:24 +0100 Subject: [PATCH 08/12] Changed minor things in scripts - informative part at the start - CUDA_VISIBLE devices now exported - vllm proper context length now also on download, shouldn't matter much since on execution, it was correct already --- scripts/copyrepo.sh | 9 +++++---- scripts/runchess.sh | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/copyrepo.sh b/scripts/copyrepo.sh index cd423b3..3698539 100644 --- a/scripts/copyrepo.sh +++ b/scripts/copyrepo.sh @@ -38,9 +38,10 @@ if [ ! -f $2 ]; then fi # Useful debug Information -srun echo "Tested: <running CHESS on SDS-BIRD>" # place a little reminder on what was tested here. -srun hostname -srun nvidia-smi +srun --ntasks=1 echo "Tested: <running CHESS on SDS-BIRD>" # place a little reminder on what was tested here. +srun --ntasks=1 hostname +srun --ntasks=1 nvidia-smi +srun --ntasks=1 echo ${CUDA_VISIBLE_DEVICES} # Create user directory on /zpool1/slurm_data/ if it doesn't exist yet srun --ntasks=1 $(if [ -d /zpool1/slurm_data/${KUERZEL} ]; then rm -r /zpool1/slurm_data/${KUERZEL}/*; else mkdir /zpool1/slurm_data/${KUERZEL}; fi) @@ -127,7 +128,7 @@ source "${VENVPATH}/bin/activate" ; srun --ntasks=1 pip3.11 install -r "${RECHESSROOT}/CHESS/requirements.txt" # Load nl2sql model via vllm -srun --time='00:05:00' --ntasks=1 ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --load-format safetensors --dtype bfloat16 --max-model-len 16384 --download-dir "${RECHESSROOT}/vllm" +srun --time='00:05:00' --ntasks=1 ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --load-format safetensors --dtype bfloat16 --max-model-len 8192 --download-dir "${RECHESSROOT}/vllm" srun --ntasks=1 ls -la "${VENVPATH}/lib/${PYTHONVER}/site-packages" deactivate # srun which pip3.11 ; # for debugging purposes diff --git a/scripts/runchess.sh b/scripts/runchess.sh index e62855f..2f19561 100755 --- a/scripts/runchess.sh +++ b/scripts/runchess.sh @@ -17,13 +17,14 @@ gpuzero=$(echo $gpulist | cut -c 1-1) gpuone=$(echo $gpulist | cut -c 3-3) echo $gpulist ' - ' $gpuzero ' - ' $gpuone -CUDA_VISIBLE_DEVICES=$gpuone +export CUDA_VISIBLE_DEVICES=$gpuone ${RECHESSROOT}/ollama/bin/ollama serve & date; sleep 30; date; -CUDA_VISIBLE_DEVICES=$gpuzero +export CUDA_VISIBLE_DEVICES=$gpuzero +echo ${CUDA_VISIBLE_DEVICES} #${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --load-format safetensors --dtype bfloat16 --max-model-len 8192 --download-dir "${RECHESSROOT}/vllm" & ${PYTHONVER} -m vllm.entrypoints.openai.api_server --model "AI4DS/NL2SQL_DeepSeek_33B" --load-format safetensors --dtype bfloat16 --max-model-len 8192 --download-dir "${RECHESSROOT}/vllm" 1> ${VLLMLOGDUMP} & @@ -39,5 +40,6 @@ run/run_main_ir_ss_cg_bird-oss.sh # run/run_main_ir_ss_cg_spider2-oss.sh kill %1 %2 +export CUDA_VISIBLE_DEVICES=$gpuzero,$gpuone cd ${OLDERPWD} deactivate -- GitLab From d98f6f91a6b0ddc17110727bac31eeea546fa475 Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Mon, 31 Mar 2025 15:43:25 +0000 Subject: [PATCH 09/12] Added commented out Llama3-70B to preprocess.py --- CHESS/src/database_utils/db_catalog/preprocess.py | 1 + 1 file changed, 1 insertion(+) diff --git a/CHESS/src/database_utils/db_catalog/preprocess.py b/CHESS/src/database_utils/db_catalog/preprocess.py index 27cfda5..95493e2 100644 --- a/CHESS/src/database_utils/db_catalog/preprocess.py +++ b/CHESS/src/database_utils/db_catalog/preprocess.py @@ -32,6 +32,7 @@ if GCP_CREDENTIALS and GCP_PROJECT and GCP_REGION: # EMBEDDING_FUNCTION = OpenAIEmbeddings(model="text-embedding-3-large") # EMBEDDING_FUNCTION = OllamaEmbeddings(model="llama3.2") EMBEDDING_FUNCTION = OllamaEmbeddings(model="mxbai-embed-large") +# EMBEDDING_FUNCTION = OllamaEmbeddings(model="llama3:70b") def make_db_context_vec_db(db_directory_path: str, **kwargs) -> None: -- GitLab From b236282c42ecb8e3980e5e3fc528ba9696ae8876 Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Mon, 31 Mar 2025 15:51:09 +0000 Subject: [PATCH 10/12] Added commented out Llama3-70B to retrieve_entity.py --- .../agents/information_retriever/tool_kit/retrieve_entity.py | 1 + 1 file changed, 1 insertion(+) diff --git a/CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py b/CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py index d304eb4..8745ace 100644 --- a/CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py +++ b/CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py @@ -35,6 +35,7 @@ class RetrieveEntity(Tool): # self.embedding_function = OpenAIEmbeddings(model="text-embedding-3-small") # self.embedding_function = OllamaEmbeddings(model="llama3.2") self.embedding_function = OllamaEmbeddings(model="nomic-embed-text") + # self.embedding_function = OllamaEmbeddings(model="llama3:70b") self.edit_distance_threshold = 0.3 self.embedding_similarity_threshold = 0.6 -- GitLab From 907940da7ffb210d707e25d4d89a631c8d7364bf Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Mon, 31 Mar 2025 16:26:40 +0000 Subject: [PATCH 11/12] Added commented out `DATA_PATH` for full BIRD dev set to .env-rechess-bird-sds --- CHESS/.env-rechess-bird-sds | 1 + 1 file changed, 1 insertion(+) diff --git a/CHESS/.env-rechess-bird-sds b/CHESS/.env-rechess-bird-sds index 9400a7f..9969865 100644 --- a/CHESS/.env-rechess-bird-sds +++ b/CHESS/.env-rechess-bird-sds @@ -3,6 +3,7 @@ OPENAI_API_KEY="OPEN AI API KEY" DB_ROOT_PATH="./data/BIRD/dev" # this directory should be the parent of test_databases DATA_MODE="dev" +# DATA_PATH="./data/BIRD/dev/dev.json" DATA_PATH="./data/BIRD/dev/sub_sampled_bird_dev_set.json" DB_ROOT_DIRECTORY="./data/BIRD/dev/dev_databases" DATA_TABLES_PATH="./data/BIRD/dev/dev_tables.json" -- GitLab From d5a52e8ccabc6c83422313f8a094b10437525925 Mon Sep 17 00:00:00 2001 From: Erik Jonas Hartnick <erik.hartnick@student.uni-halle.de> Date: Mon, 31 Mar 2025 16:56:13 +0000 Subject: [PATCH 12/12] Edit README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index d4ba120..47842e7 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,6 @@ Original-repo: [Link](https://github.com/ShayanTalaei/CHESS) Paper: [Link](https://arxiv.org/abs/2405.16755) -ShareLatex-Project: https://sharelatex.informatik.uni-halle.de/3866392724cmvnzhyfwdkk - ## Datasets BIRD: [Link](https://bird-bench.github.io/) -- GitLab