Compare revisions: aktxt/re-chess
Commits on Source (15)
Showing with 1808 additions and 26 deletions
ollama/*
!ollama/.gitignore
!ollama/models
!ollama/models/.gitignore
CHESS/.env.ollama3.2
slurm*.err
slurm*.out
OPENAI_API_KEY="OPEN AI API KEY"
DB_ROOT_PATH="./data/BIRD/dev" # this directory should be the parent of test_databases
DATA_MODE="dev"
# DATA_PATH="./data/BIRD/dev/dev.json"
DATA_PATH="./data/BIRD/dev/sub_sampled_bird_dev_set.json"
DB_ROOT_DIRECTORY="./data/BIRD/dev/dev_databases"
DATA_TABLES_PATH="./data/BIRD/dev/dev_tables.json"
INDEX_SERVER_HOST='localhost'
INDEX_SERVER_PORT=12345
OPENAI_API_KEY='EMPTY'
GCP_PROJECT=''
GCP_REGION='us-central1'
GCP_CREDENTIALS=''
GOOGLE_CLOUD_PROJECT=''
# PATH="$PATH:$PWD/ollama/bin"
# OLLAMA_HOST="127.0.0.1:11434"
# OLLAMA_MODELS="~/.ollama/models"
DB_ROOT_PATH="./data/dev" # this directory should be the parent of test_databases
DATA_MODE="dev"
DATA_PATH="./data/dev/dev.json"
DB_ROOT_DIRECTORY="./data/dev/dev_databases"
DATA_TABLES_PATH="./data/dev/dev_tables.json"
INDEX_SERVER_HOST='localhost'
INDEX_SERVER_PORT=12345
OPENAI_API_KEY="OPEN AI API KEY"
GCP_PROJECT=''
GCP_REGION='us-central1'
GCP_CREDENTIALS=''
GOOGLE_CLOUD_PROJECT=''
# uncomment and configure these, then source this file before running ollama serve, pull or run
# PATH="$PATH:$PWD/ollama/bin"
# OLLAMA_HOST="127.0.0.1:11434"
# OLLAMA_MODELS="~/.ollama/models"
......@@ -11,6 +11,7 @@ langchain-community==0.2.6
langchain-google-genai==1.0.9
langchain_google_vertexai==1.0.8
google-cloud-aiplatform==1.63.0
langchain_ollama==0.1.3
langchain-openai==0.1.12
langgraph==0.1.4
pandas==2.2.1
......@@ -25,4 +26,6 @@ websockets==12.0
filelock==3.15.4
faiss-cpu==1.8.0
datasets==2.21.0
pyyaml==6.0.2
\ No newline at end of file
pyyaml==6.0.2
vllm==0.3.3
outlines==0.0.33
setting_name: CHESS_IR_CG_UT
team_agents:
  information_retriever:
    #engine: 'gpt-4o-mini'
    engine: 'meta-llama/llama3-2'
    tools:
      extract_keywords:
        template_name: 'extract_keywords'
        engine_config:
          #engine_name: 'gpt-4o-mini'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.2
        parser_name: 'python_list_output_parser'
      retrieve_entity: {}
      retrieve_context:
        top_k: 5
  candidate_generator:
    #engine: 'gpt-4o-mini'
    engine: 'meta-llama/llama3-2'
    tools:
      generate_candidate:
        generator_configs:
          - template_name: 'generate_candidate_one'
            engine_config:
              #engine_name: 'gpt-4o-mini'
              engine_name: 'meta-llama/llama3-2'
              temperature: 0.5
            parser_name: 'generate_candidate_gemini_markdown_cot'
            sampling_count: 10
          - template_name: 'generate_candidate_two'
            engine_config:
              #engine_name: 'gpt-4o-mini'
              engine_name: 'meta-llama/llama3-2'
              temperature: 0.5
            parser_name: 'generate_candidate_gemini_markdown_cot'
            sampling_count: 10
      revise:
        template_name: 'revise_one'
        engine_config:
          #engine_name: 'gpt-4o-mini'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.0
        parser_name: 'revise_new'
  unit_tester:
    #engine: 'gpt-4o-mini'
    engine: 'meta-llama/llama3-2'
    tools:
      generate_unit_test:
        template_name: 'generate_unit_tests'
        engine_config:
          #engine_name: 'gpt-4o-mini'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.8
        parser_name: 'generate_unit_tests'
        unit_test_count: 20
        sampling_count: 1
      evaluate:
        template_name: 'evaluate'
        engine_config:
          #engine_name: 'gpt-4o-mini'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.0
        parser_name: 'evaluate'
setting_name: CHESS_IR_SS_CG_BIRD_OSS
team_agents:
  information_retriever:
    engine: 'meta-llama/Meta-Llama-3-70B-Instruct'
    tools:
      extract_keywords:
        template_name: 'extract_keywords'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.2
        parser_name: 'python_list_output_parser'
      retrieve_entity: {}
      retrieve_context:
        top_k: 5
  schema_selector:
    engine: 'meta-llama/Meta-Llama-3-70B-Instruct'
    tools:
      filter_column:
        template_name: 'filter_column'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.0
        parser_name: 'filter_column'
      select_tables:
        mode: 'ask_model'
        template_name: 'select_tables'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.0
        parser_name: 'select_tables'
      select_columns:
        mode: 'ask_model'
        template_name: 'select_columns'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.0
        parser_name: 'select_columns'
  candidate_generator:
    engine: 'meta-llama/Meta-Llama-3-70B-Instruct'
    tools:
      generate_candidate:
        generator_configs:
          - template_name: 'generate_candidate_finetuned'
          # - template_name: 'generate_candidate_one'
            engine_config:
              engine_name: 'finetuned_nl2sql'
              temperature: 0.01
            #parser_name: 'generate_candidate_gemini_markdown_cot'
            parser_name: 'generated_candidate_finetuned'
            sampling_count: 1
      revise:
        template_name: 'revise_two'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.0
        parser_name: 'revise'
        sampling_count: 3
setting_name: CHESS_IR_SS_CG
team_agents:
  information_retriever:
    #engine: 'gpt-4o-mini'
    engine: 'meta-llama/llama3-2'
    tools:
      extract_keywords:
        template_name: 'extract_keywords'
        engine_config:
          #engine_name: 'gpt-4o-mini'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.2
        parser_name: 'python_list_output_parser'
      retrieve_entity: {}
      retrieve_context:
        top_k: 5
  schema_selector:
    #engine: 'gpt-4o-mini'
    engine: 'meta-llama/llama3-2'
    tools:
      filter_column:
        template_name: 'filter_column'
        engine_config:
          #engine_name: 'gpt-4o-mini'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.0
        parser_name: 'filter_column'
      select_tables:
        mode: 'ask_model'
        template_name: 'select_tables'
        engine_config:
          #engine_name: 'gpt-4o'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.0
        parser_name: 'select_tables'
      select_columns:
        mode: 'ask_model'
        template_name: 'select_columns'
        engine_config:
          #engine_name: 'gpt-4o'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.0
        parser_name: 'select_columns'
  candidate_generator:
    #engine: 'gpt-4o-mini'
    engine: 'meta-llama/llama3-2'
    tools:
      generate_candidate:
        generator_configs:
          - template_name: 'generate_candidate_one'
            engine_config:
              #engine_name: 'gpt-4o-mini'
              engine_name: 'meta-llama/llama3-2'
              temperature: 0.01
            parser_name: 'generate_candidate_gemini_markdown_cot'
            sampling_count: 1
      revise:
        template_name: 'revise_one'
        engine_config:
          #engine_name: 'gpt-4o-mini'
          engine_name: 'meta-llama/llama3-2'
          temperature: 0.0
        parser_name: 'revise_new'
setting_name: CHESS_IR_SS_CG_SPIDER
team_agents:
  information_retriever:
    engine: 'meta-llama/Meta-Llama-3-70B-Instruct'
    tools:
      extract_keywords:
        template_name: 'extract_keywords'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.2
        parser_name: 'python_list_output_parser'
      retrieve_entity: {}
      #retrieve_context:
      #  top_k: 5
  schema_selector:
    engine: 'meta-llama/Meta-Llama-3-70B-Instruct'
    tools:
      filter_column:
        template_name: 'filter_column'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.0
        parser_name: 'filter_column'
      select_tables:
        mode: 'ask_model'
        template_name: 'select_tables'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.0
        parser_name: 'select_tables'
      select_columns:
        mode: 'ask_model'
        template_name: 'select_columns'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.0
        parser_name: 'select_columns'
  candidate_generator:
    engine: 'meta-llama/Meta-Llama-3-70B-Instruct'
    tools:
      generate_candidate:
        generator_configs:
          - template_name: 'generate_candidate_one'
            engine_config:
              engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
              temperature: 0.01
            parser_name: 'generate_candidate_gemini_markdown_cot'
            sampling_count: 1
      revise:
        template_name: 'revise_one'
        engine_config:
          engine_name: 'meta-llama/Meta-Llama-3-70B-Instruct'
          temperature: 0.0
        parser_name: 'revise_new'
source .env
data_mode=$DATA_MODE # Options: 'dev', 'train'
data_path=$DATA_PATH # UPDATE THIS WITH THE PATH TO THE TARGET DATASET
config="./run/configs/CHESS_IR_CG_UT_LLAMA3-2.yaml"
num_workers=1 # Number of workers to use for parallel processing, set to 1 for no parallel processing
python3 -u ./src/main.py --data_mode ${data_mode} --data_path ${data_path} --config "$config" \
--num_workers ${num_workers} --pick_final_sql true
source .env
data_mode=$DATA_MODE # Options: 'dev', 'train'
data_path=$DATA_PATH # UPDATE THIS WITH THE PATH TO THE TARGET DATASET
config="./run/configs/CHESS_IR_SS_CG_BIRD_OSS.yaml"
num_workers=1 # Number of workers to use for parallel processing, set to 1 for no parallel processing
python3 -u ./src/main.py --data_mode ${data_mode} --data_path ${data_path} --config "$config" \
--num_workers ${num_workers} --pick_final_sql true
source .env
data_mode=$DATA_MODE # Options: 'dev', 'train'
data_path=$DATA_PATH # UPDATE THIS WITH THE PATH TO THE TARGET DATASET
config="./run/configs/CHESS_IR_SS_CG_LLAMA3-2.yaml"
num_workers=1 # Number of workers to use for parallel processing, set to 1 for no parallel processing
python3 -u ./src/main.py --data_mode ${data_mode} --data_path ${data_path} --config "$config" \
--num_workers ${num_workers} --pick_final_sql true
File mode changed from 100644 to 100755
......@@ -6,6 +6,7 @@ from langchain_chroma import Chroma
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_ollama import OllamaEmbeddings
from google.oauth2 import service_account
from google.cloud import aiplatform
import vertexai
......@@ -28,7 +29,10 @@ if GCP_CREDENTIALS and GCP_PROJECT and GCP_REGION:
# EMBEDDING_FUNCTION = VertexAIEmbeddings(model_name="text-embedding-004")#OpenAIEmbeddings(model="text-embedding-3-large")
EMBEDDING_FUNCTION = OpenAIEmbeddings(model="text-embedding-3-large")
# EMBEDDING_FUNCTION = OpenAIEmbeddings(model="text-embedding-3-large")
# EMBEDDING_FUNCTION = OllamaEmbeddings(model="llama3.2")
EMBEDDING_FUNCTION = OllamaEmbeddings(model="mxbai-embed-large")
# EMBEDDING_FUNCTION = OllamaEmbeddings(model="llama3:70b")
def make_db_context_vec_db(db_directory_path: str, **kwargs) -> None:
......
......@@ -2,6 +2,7 @@ from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic
from langchain_google_vertexai import VertexAI
from langchain_ollama import ChatOllama
from google.oauth2 import service_account
from google.cloud import aiplatform
from typing import Dict, Any
......@@ -97,26 +98,26 @@ ENGINE_CONFIGS: Dict[str, Dict[str, Any]] = {
"constructor": ChatAnthropic,
"params": {"model": "claude-3-opus-20240229", "temperature": 0}
},
# "finetuned_nl2sql": {
# "constructor": ChatOpenAI,
# "params": {
# "model": "AI4DS/NL2SQL_DeepSeek_33B",
# "openai_api_key": "EMPTY",
# "openai_api_base": "/v1",
# "max_tokens": 400,
# "temperature": 0,
# "stop": ["```\n", ";"]
# }
# },
"finetuned_nl2sql": {
"constructor": ChatOpenAI,
"params": {
"model": "ft:gpt-4o-mini-2024-07-18:stanford-university::9p4f6Z4W",
"model": "AI4DS/NL2SQL_DeepSeek_33B",
"openai_api_key": "EMPTY",
"openai_api_base": "http://localhost:8000/v1",
"max_tokens": 400,
"temperature": 0,
"stop": ["```\n", ";"]
}
},
# "finetuned_nl2sql": {
# "constructor": ChatOpenAI,
# "params": {
# "model": "ft:gpt-4o-mini-2024-07-18:stanford-university::9p4f6Z4W",
# "max_tokens": 400,
# "temperature": 0,
# "stop": ["```\n", ";"]
# }
# },
"column_selection_finetuning": {
"constructor": ChatOpenAI,
"params": {
......@@ -146,17 +147,47 @@ ENGINE_CONFIGS: Dict[str, Dict[str, Any]] = {
# "stop": [";"]
# }
# },
# "meta-llama/Meta-Llama-3-70B-Instruct": {
# "constructor": ChatOpenAI,
# "params": {
# "model": "meta-llama/Meta-Llama-3-70B-Instruct",
# "openai_api_key": "EMPTY",
# "openai_api_base": "/v1",
# "max_tokens": 600,
# "temperature": 0,
# "model_kwargs": {
# "stop": [""]
# }
# }
# },
"meta-llama/Meta-Llama-3-70B-Instruct": {
"constructor": ChatOpenAI,
"constructor": ChatOllama,
#"constructor": ChatOpenAI,
"params": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"openai_api_key": "EMPTY",
"openai_api_base": "/v1",
"model": "llama3:70b",
#"openai_api_key": "EMPTY",
#"openai_api_base": "http://localhost:11434/v1",
"max_tokens": 600,
#"num_predict": 128, # Should be equiv to max_tokens, Ollama is 128
"temperature": 0,
"model_kwargs": {
"stop": [""]
}
"stop": ["<|eot_id|>"],
},
"num_ctx": 8192
}
}
# "meta-llama/llama3-2": {
# "constructor": ChatOpenAI,
# "params": {
# "model": "llama3.2",
# "openai_api_key": "EMPTY",
# "openai_api_base": "http://localhost:11434/v1",
# "max_tokens": 600,
# "temperature": 0 #,
# # "model_kwargs": {
# # "stop": [""]
# # }
#
# }
# }
}
......@@ -7,17 +7,19 @@ from database_utils.execution import ExecutionStatus
from workflow.system_state import SystemState
from workflow.sql_meta_info import SQLMetaInfo
from workflow.agents.tool import Tool
from runner.database_manager import DatabaseManager
class Revise(Tool):
"""
Tool for correcting a SQL query that returns empty set or has a syntax error.
"""
def __init__(self, template_name: str = None, engine_config: str = None, parser_name: str = None):
def __init__(self, template_name: str = None, engine_config: str = None, parser_name: str = None, sampling_count: int = 1):
super().__init__()
self.template_name = template_name
self.engine_config = engine_config
self.parser_name = parser_name
self.sampling_count = sampling_count
def _run(self, state: SystemState):
......@@ -68,22 +70,34 @@ class Revise(Tool):
engine=get_llm_chain(**self.engine_config),
parser=get_parser(self.parser_name),
request_list=request_list,
step=self.tool_name
step=self.tool_name,
sampling_count=self.sampling_count # CHANGED
)
response = [r[0] for r in response]
except Exception as e:
print(f"Error in Checker while getting response: {e}")
response = []
index = 0
revised_sqls = [res["revised_SQL"] for res in response]
# For all candidates (in our case only N=1)
for target_SQL_meta_info in target_SQL_meta_infos:
try:
# If the candidate needs a revision...
if target_SQL_meta_info.need_fixing:
refinement_response = response[index]
index += 1
revised_sql = DatabaseManager().aggregate_sqls(sqls=revised_sqls)
chosen_res = next(res for res in response if res["revised_SQL"] == revised_sql)
refinement_response = { "refined_sql_query": chosen_res["revised_SQL"] }
'''
if "SELECT" not in refinement_response["refined_sql_query"]:
refinement_response = {
"refined_sql_query": target_SQL_meta_info.SQL
}
'''
else:
refinement_response = {
"refined_sql_query": target_SQL_meta_info.SQL
......@@ -93,6 +107,7 @@ class Revise(Tool):
refinement_response = {
"refined_sql_query": target_SQL_meta_info.SQL
}
if "refined_sql_query" in refinement_response:
if refinement_response["refined_sql_query"]:
state.SQL_meta_infos[SQL_id].append(SQLMetaInfo(**{
......
......@@ -3,6 +3,7 @@ import difflib
from typing import List, Dict, Any, Tuple, Optional
from langchain_openai import OpenAIEmbeddings
from langchain_ollama import OllamaEmbeddings
from google.oauth2 import service_account
from google.cloud import aiplatform
import vertexai
......@@ -31,7 +32,10 @@ class RetrieveEntity(Tool):
def __init__(self):
super().__init__()
self.embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")
# self.embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")
# self.embedding_function = OllamaEmbeddings(model="llama3.2")
self.embedding_function = OllamaEmbeddings(model="nomic-embed-text")
# self.embedding_function = OllamaEmbeddings(model="llama3:70b")
self.edit_distance_threshold = 0.3
self.embedding_similarity_threshold = 0.6
......@@ -307,4 +311,4 @@ class RetrieveEntity(Tool):
def _get_updates(self, state: SystemState) -> Dict:
return {"similar_columns": state.similar_columns,
"schema_with_examples": state.schema_with_examples}
\ No newline at end of file
"schema_with_examples": state.schema_with_examples}
......@@ -6,6 +6,211 @@ Original-repo: [Link](https://github.com/ShayanTalaei/CHESS)
Paper: [Link](https://arxiv.org/abs/2405.16755)
ShareLatex-Project: https://sharelatex.informatik.uni-halle.de/3866392724cmvnzhyfwdkk
## Datasets
## ...
\ No newline at end of file
BIRD: [Link](https://bird-bench.github.io/)
Spider: [Link](https://yale-lily.github.io/spider)
1. We need a path to save our dataset(s) to. The `CHESS` repo provides a `data` directory for this purpose, where the BIRD dev set works more or less out of the box once downloaded; we use the placeholder `<dataset-path>` for the download path to allow for any other storage option. We create the subdirectories `BIRD` and `Spider` for the two datasets:
```bash
mkdir <dataset-path>/BIRD <dataset-path>/Spider
```
2. We download the dataset(s) with `curl` or with `wget`, depending on which one is installed on our system, using the `<dataset-path>` from above (skip BIRD train, the second line, if not needed):
With `curl`:
```bash
curl -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip' -o <dataset-path>/BIRD/dev.zip ; \
curl -L 'https://bird-bench.oss-cn-beijing.aliyuncs.com/train.zip' -o <dataset-path>/BIRD/train.zip ; \
curl -L 'https://drive.usercontent.google.com/download?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&confirm=t' -o <dataset-path>/Spider/spider_data.zip
```
With `wget`:
```bash
wget -O <dataset-path>/BIRD/dev.zip 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip' ; \
wget -O <dataset-path>/BIRD/train.zip 'https://bird-bench.oss-cn-beijing.aliyuncs.com/train.zip' ; \
wget -O <dataset-path>/Spider/spider_data.zip 'https://drive.usercontent.google.com/download?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&confirm=t'
```
3. Extracting the zip archives we just downloaded (skipping `BIRD/train.zip` if it was not downloaded):
```bash
unzip <dataset-path>/BIRD/dev.zip -d <dataset-path>/BIRD
unzip <dataset-path>/BIRD/train.zip -d <dataset-path>/BIRD
unzip <dataset-path>/Spider/spider_data.zip -d <dataset-path>/Spider
```
4. For the BIRD dataset, we rename the extracted subdirectory to `dev` for ease of use, then we extract the databases into a separate subdirectory (skip BIRD train, the third line, as needed and remove the trailing `\`):
```bash
mv <dataset-path>/BIRD/dev_20240627 <dataset-path>/BIRD/dev ; \
unzip <dataset-path>/BIRD/dev_databases.zip -d <dataset-path>/BIRD/dev ; \
unzip <dataset-path>/BIRD/train_databases.zip -d <dataset-path>/BIRD/train
```
5. To configure the datasets, we copy the `dotenv_copy` file provided by the CHESS authors to a `.env` file (or we copy the `.env.llama3.2.example` provided by the replication authors to `.env`):
Copy `dotenv_copy` to `.env`:
```bash
cp CHESS/dotenv_copy CHESS/.env
```
6. Editing the `.env` config with our favourite command-line editor, we can now set the dataset locations as follows (a quick sanity check follows this list):
- `DB_ROOT_PATH` should be the parent directory of the database directory, e.g. `"<dataset-path>/BIRD/dev"` for the BIRD `dev` dataset, with the subdirectory `dev_databases` (where the `.sqlite` and description `.csv` files are stored)
- `DATA_MODE` possible values are `"dev"` or `"train"`; switches between inference mode and train mode
- `DATA_PATH` should be the JSON file providing the user questions and the expected SQL results, e.g. `"<dataset-path>/BIRD/dev/dev.json"` for the BIRD dev dataset, or the reduced SDS dataset provided by the authors of CHESS (in the repository under `"data/dev/sub_sampled_bird_dev_set.json"`)
- `DB_ROOT_DIRECTORY` should be the database directory, e.g. `"<dataset-path>/BIRD/dev/dev_databases"` for the BIRD `dev` dataset (where the `.sqlite` and description `.csv` files are stored in subdirectories for each database)
- `INDEX_SERVER_HOST` TODO: find out what this does; left at the provided value `"localhost"`
- `INDEX_SERVER_PORT` TODO: find out what this does; left at the provided value `12345`
- `OPENAI_API_KEY` should be the OpenAI API key if you plan on using OpenAI; otherwise set it to any (non-empty) value
- `GCP_PROJECT` should be set to the Google Cloud Project used for Gemini requests (left empty, i.e. set to `''`, by the replication project; unnecessary for the Ollama setup)
- `GCP_REGION` should be set to the region of the Google Cloud Project used for Gemini requests (left at the provided value `'us-central1'` by the replication project; unnecessary for the Ollama setup)
- `GCP_CREDENTIALS` should be set to the credentials needed to authorize with the Google Cloud Project used for Gemini requests (left empty, i.e. set to `''`, by the replication project; unnecessary for the Ollama setup)
- `GOOGLE_CLOUD_PROJECT` probably also should be set to the Google Cloud Project used for Gemini requests (left empty, i.e. set to `''`, by the replication project; unnecessary for the Ollama setup)
- `PATH`, `OLLAMA_HOST` and `OLLAMA_MODELS` are optional in `.env.llama3.2` for ease of use with Ollama and can stay commented out
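Because the run scripts `source .env` before launching Python, these settings surface as ordinary environment variables. A minimal sanity check (our addition, not part of the repo):
```python
# Print the dataset-related settings as seen by the Python process;
# run from the CHESS directory after `source .env`.
import os

for key in ("DB_ROOT_PATH", "DATA_MODE", "DATA_PATH", "DB_ROOT_DIRECTORY", "DATA_TABLES_PATH"):
    print(f"{key}={os.getenv(key)}")
```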
## Installation of Ollama (for Linux)
>Downloads for macOS and Windows can be found here: [Link](https://ollama.com/download)
>Documentation for installation under Linux: [GitHub link](https://github.com/ollama/ollama/blob/main/docs/linux.md), [GitHub permalink](https://github.com/ollama/ollama/blob/1c198977ecdd471aee827a378080ace73c02fa8d/docs/linux.md)
>FAQ on running with a different model storage path (possibly needed for large persistent data storage on the institute server): [GitHub link](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-do-i-set-them-to-a-different-location), [GitHub permalink](https://github.com/ollama/ollama/blob/1c198977ecdd471aee827a378080ace73c02fa8d/docs/faq.md#how-do-i-set-them-to-a-different-location)
1. Figuring out which of `curl` or `wget` is installed with `which curl` or `which wget` (should print an install path), then downloading with the appropriate command below. We assume that `<download-path>` is a placeholder for the path where the archive file will be stored. We suggest using the provided `ollama` directory for testing, but we can specify any other directory that fits our needs.
With `curl`:
```bash
curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o <download-path>/ollama-linux-amd64.tgz
```
With `wget`:
```bash
wget -O <download-path>/ollama-linux-amd64.tgz https://ollama.com/download/ollama-linux-amd64.tgz
```
2. Extracting the tar file to a path of our choice by replacing the placeholder `<install-path>` with our path. We suggest the provided `ollama` directory for testing purposes. Apply `sudo` as needed if we prefer following the [official documentation](https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install).
```bash
tar -C <install-path> -xzf ollama-linux-amd64.tgz
```
3. Setting the `OLLAMA_MODELS` environment variable to the path where we would like to store our models (placeholder `<model-path>`), then starting the Ollama web service as a background job in bash. Note that the variable is passed on the same line as the command (no `;`) so that the server process inherits it; as a reminder, we can also add environment variables to our `.bashrc`.
```bash
OLLAMA_MODELS=<model-path> <install-path>/./bin/ollama serve &
```
4. Then, downloading the model of our choice (placeholder `<model>`, e.g. `llama3:70b` (48 GB VRAM) or `llama3.2:3b` (3 GB VRAM)); this is only necessary once, as long as `OLLAMA_MODELS` is set to the correct `<model-path>`.
```bash
<install-path>/./bin/ollama pull <model>
```
5. Ollama is now ready to receive requests to `<model>` from CHESS. We can also start a chat session with `ollama run <model>` to check that everything works. To stop the web service, bring the job to the foreground with `fg`, then stop it with `Ctrl` + `C`. To restart the web service, simply run (only) step 3 again. Remove/uninstall with `rm <download-path>/ollama-linux-amd64.tgz`, `rm -r <install-path>/*` and `rm -r <model-path>/*`. A scriptable readiness probe follows below.
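For a scriptable readiness probe, we can query the Ollama REST API directly; `/api/tags` lists the locally pulled models (a sketch assuming the default host and port from above):
```python
# Probe the local Ollama server and list the pulled models via its REST API.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:11434/api/tags") as resp:
    models = json.load(resp)["models"]
print([m["name"] for m in models])  # e.g. ['llama3.2:latest']
```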
## Setting up the venv
1. We change to the `CHESS` directory with `cd CHESS`. There we create a python virtual environment named `venv` with the command (using the appropriate python version on our system):
```bash
python -m venv venv
```
2. We activate the environment with the following command, which adds `(venv) ` as a prefix to our command prompt:
```bash
source venv/bin/activate
```
3. We install the required python packages with:
```bash
pip install -r requirements.txt
```
4. We are ready to execute any further steps below. Once we're finished, we can deactivate the virtual environment (which removes the `(venv) ` prefix) by running:
```bash
deactivate
```
## Configuring the preprocessing
The CHESS framework uses the [langchain python package](https://python.langchain.com/docs/introduction/) to connect to an LLM via the API of a web service. The Ollama integration for langchain was added to the `requirements.txt` file as `langchain_ollama==0.1.3` (version 0.1.3 because of its compatibility with the existing requirements).
The preprocessing calls the LLM to embed the database and column descriptions, so the file [CHESS/src/database_utils/db_catalog/preprocessing.py](CHESS/src/database_utils/db_catalog/preprocessing.py) was edited: the import `from langchain_ollama import OllamaEmbeddings` was added, and the existing `EMBEDDING_FUNCTION` was commented out and replaced with `EMBEDDING_FUNCTION = OllamaEmbeddings(model="llama3.2")`.
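For reference, the relevant lines after this edit look roughly as follows (a sketch; note that the diff in this revision shows `mxbai-embed-large` as the embedding model that ended up active):
```python
from langchain_ollama import OllamaEmbeddings

# previous default, kept commented for reference:
# EMBEDDING_FUNCTION = OpenAIEmbeddings(model="text-embedding-3-large")
EMBEDDING_FUNCTION = OllamaEmbeddings(model="llama3.2")
```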
1. To use a different model with Ollama for embedding, the parameter `model="llama3.2"` must be changed to `model="<model>"`.
2. Ensure that the Ollama web service is running (`OLLAMA_MODELS=<model-path> ollama/bin/./ollama serve &`)
3. Run the preprocessing by changing into the `CHESS` directory with `cd CHESS`, assuming you are in the replication's repository root. Then run:
```bash
run/./run_preprocess.sh
```
## Configuring the agents
To configure the model for the agents, we need to add a new engine/model configuration to [CHESS/src/llm/engine_configs.py](CHESS/src/llm/engine_configs.py). For Llama3.2 with Ollama, we added the following configuration to the `ENGINE_CONFIGS`:
```python
"meta-llama/llama3-2": {
"constructor": ChatOllama,
"params": {
"model": "llama3.2",
"temperature": 0,
"model_kwargs": {
"stop": [""],
},
"num_ctx": 32768
}
}
```
- The `constructor` will be the langchain constructor `ChatOllama` for API calls to the LLM.
- The `params` are the constructor parameters.
- The `model` is the model used by Ollama.
- The `temperature` is the default model temperature. This gets overwritten by the config of the agents.
- The `model_kwargs` are copied from the existing Llama config, including the `stop` entry.
- The `num_ctx` is the context window size used by the model. Ollama defaults to a context size of 2048 tokens; we observed prompts of about 15,000 tokens in the warnings from Ollama, so we set a context of about twice that. Note that Llama3.2 allows a context size of up to 128,000 tokens, whereas Llama3-70B only allows 8192 tokens. Check the limits of the model you would like to run with Ollama.
- For any other parameters, check the [langchain_ollama documentation](https://python.langchain.com/api_reference/ollama/chat_models/langchain_ollama.chat_models.ChatOllama.html#langchain_ollama.chat_models.ChatOllama) on `ChatOllama`. A quick connectivity check follows below.
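Before launching the full pipeline, the engine configuration can be exercised with a single prompt (our sketch, not part of the repo; assumes `ollama serve` is running and the model is pulled):
```python
# Send one prompt through ChatOllama using the parameters from the config above.
from langchain_ollama import ChatOllama

llm = ChatOllama(model="llama3.2", temperature=0, num_ctx=32768)
reply = llm.invoke("Reply with the single word: OK")
print(reply.content)
```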
1. Another model can be added for Ollama with a configuration similar to the one above.
To configure the agents, a `.yaml` configuration file and a shell script are needed. For testing purposes, the authors of the replication copied the [CHESS/run/configs/CHESS_IR_CG_UT.yaml](CHESS/run/configs/CHESS_IR_CG_UT.yaml) config file to [CHESS/run/configs/CHESS_IR_CG_UT_LLAMA3-2.yaml](CHESS/run/configs/CHESS_IR_CG_UT_LLAMA3-2.yaml), copied the [CHESS/run/configs/CHESS_IR_SS_CG.yaml](CHESS/run/configs/CHESS_IR_SS_CG.yaml) config file to [CHESS/run/configs/CHESS_IR_SS_CG_LLAMA3-2.yaml](CHESS/run/configs/CHESS_IR_SS_CG_LLAMA3-2.yaml), and replaced every `engine` and `engine_name` entry with the `meta-llama/llama3-2` model configured above.
2. Copying the appropriate `.yaml` file in `CHESS/run/configs` and setting all `engine` and `engine_name` parameters to the model of choice configures the agents for one of the two workflows in the original CHESS paper, just with a different model.
Similarly, the shell scripts to run the agents were copied for testing purposes by the authors of the replication: [CHESS/run/run_main_ir_cg_ut.sh](CHESS/run/run_main_ir_cg_ut.sh) to [CHESS/run/run_main_ig_cg_ut_llama3.2.sh](CHESS/run/run_main_ig_cg_ut_llama3.2.sh) and [CHESS/run/run_main_ir_ss_cg.sh](CHESS/run/run_main_ir_ss_cg.sh) to [CHESS/run/run_main_ir_ss_cg_llama3.2.sh](CHESS/run/run_main_ir_ss_cg_llama3.2.sh) in `CHESS/run`. The `config` variable was changed to the path of the new agent configuration file:
In [CHESS/run/run_main_ig_cg_ut_llama3.2.sh](CHESS/run/run_main_ig_cg_ut_llama3.2.sh), the line:
```bash
config="./run/config/CHESS_IR_CG_UT.yaml"
```
was changed to the line:
```bash
config="./run/config/CHESS_IR_CG_UT_LLAMA3-2.yaml"
```
3. Copying a run script in the directory `CHESS/run` and adjusting the `config` variable to the appropriate config file created in step 2 makes the framework runnable with a custom configuration.
In the information retriever agent (IR) there is another call to the `embed` API that is not covered by the config in the previous steps. In the `retrieve_entity` tool, in the file [CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py](CHESS/src/workflow/agents/information_retriever/tool_kit/retrieve_entity.py), the replication authors added the import `from langchain_ollama import OllamaEmbeddings`, and the `embedding_function` property of the `RetrieveEntity` class (line 34, `self.embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")`) was adapted to the Ollama embeddings: `self.embedding_function = OllamaEmbeddings(model="llama3.2")`. A quick check of the swapped-in embedding function follows below.
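The swapped-in embedding function can be sanity-checked in isolation (a sketch assuming a running Ollama server; the diff in this revision shows `nomic-embed-text` as the model that ended up active here):
```python
# Embed a sample entity keyword the way retrieve_entity would.
from langchain_ollama import OllamaEmbeddings

embedding_function = OllamaEmbeddings(model="llama3.2")
vector = embedding_function.embed_query("singer")
print(len(vector))  # dimensionality depends on the chosen model
```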
4. Changing the model for the embedding calls of the `retrieve_entity` tool in the information retriever agent to the model of choice will ensure all API calls are directed to the appropriate LLMs.
5. Run the shell script for your configuration from the `CHESS` directory, e.g. for the Llama3.2 testing config of the replication authors:
```bash
run/./run_main_ir_ss_cg_llama3.2.sh
```
## ...