fix bug that led to missing questions in the filtering

4c0567ad · Adrien Klose · ea12638d · 4c0567ad
Commit 4c0567ad authored 10 months ago by Adrien Klose
--- a/ideas_practice/code/bioasq_extract_yes_no.ipynb
+++ b/ideas_practice/code/bioasq_extract_yes_no.ipynb
@@ -125,6 +125,7 @@
    "    writer.writerow(header_tsv)\n",
    "    for question in filtered_data:\n",
    "        id += 1\n",
+    "        written = False\n",
    "        if \"triples\" in question:\n",
    "            for triple in question[\"triples\"]:\n",
    "                # check for the filtering criteria\n",
@@ -140,7 +141,12 @@
    "                relation = base[-1].split(\"#\")[-1]  # some of the links include a # for the exact path that needs to be removed\n",
    "            \n",
    "                writer.writerow([question[\"id\"], id, triple[\"s\"], triple[\"p\"], triple[\"o\"], \"\", relation, \"\", \"\", question[\"body\"], question[\"exact_answer\"]])\n",
+    "                written = True\n",
    "        else:\n",
+    "            writer.writerow([question[\"id\"], id, \"\", \"\", \"\", \"\", \"\", \"\", \"\", question[\"body\"], question[\"exact_answer\"]])\n",
+    "            written = True\n",
+    "        # if all triples are filtered the question would not be written without this\n",
+    "        if not written:\n",
    "            writer.writerow([question[\"id\"], id, \"\", \"\", \"\", \"\", \"\", \"\", \"\", question[\"body\"], question[\"exact_answer\"]])"
   ]
  },

 %% Cell type:code id:20bdf0bb-ec1d-4cde-bf92-29a2ee50d344 tags:

 ``` python
 import json
 import csv
 import re
 ```

 %% Cell type:code id:51b4d7cb-9235-443d-b588-724d3cf06cc3 tags:

 ``` python
 # Load the "questions" list from the training JSON file and report its size.
 # The encoding is given explicitly so decoding does not depend on the
 # platform's default locale encoding.
 with open("../data/training12b_new.json", "r", encoding="utf-8") as f:
    train12b = json.load(f)["questions"]
 print(len(train12b))
 ```

 %% Output

    5046

 %% Cell type:code id:4a55b8a8-3f2b-4c95-b01a-c68e426081df tags:

 ``` python
 ### keep only the yesno questions, bucketed by their exact answer
 yes_questions, no_questions = [], []
 for question in train12b:
    if question["type"] != "yesno":
        continue
    # anything other than an exact "yes" lands in the "no" bucket
    bucket = yes_questions if question["exact_answer"] == "yes" else no_questions
    bucket.append(question)
 # "no" questions come first, followed by the "yes" questions
 filtered_data = [*no_questions, *yes_questions]
 #print(filtered_data[55])
 print(len(no_questions))
 print(len(yes_questions))
 ```

 %% Output

    354
    1003

 %% Cell type:code id:405e5736-2980-4594-be5f-8629f0e43960 tags:

 ``` python
 ## test csv writer
 #with open("test_csv.csv", 'w') as csvfile:
    # creating a csv writer object
 #    csvwriter = csv.writer(csvfile)
 #
 #    # writing the fields
 #    csvwriter.writerow(["1","qweqwr,qwrqwr,,qwrqwr,,2","3"])
 ```

 %% Cell type:code id:7c283cca-ae60-44ec-a1af-efdbba5b65c4 tags:

 ``` python
 # Column names of the unfiltered yes/no CSV export.
 header_tsv = ["bioasq_id", "id", "s_bioasq", "p_bioasq", "o_bioasq", "question", "ground_truth"]
 # the ideal answer might be interesting if we test more than yes and no

 # Write yes and no questions to csv: one row per triple, or a single row with
 # empty triple columns when the question has no triples at all.
 id = 0  # running 1-based question number; name kept so the notebook globals stay unchanged (shadows builtin id())
 yes = 0  # not used in this cell; kept so the notebook globals stay unchanged
 no = 0
 # newline="" hands line-ending control to the csv module (avoids blank rows on
 # Windows); the explicit encoding keeps non-ASCII question text portable.
 with open("../data/bioasq_yesno.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_tsv)
    for question in filtered_data:
        id += 1
        # A missing "triples" key and an empty list are handled alike, so a
        # question with zero triples is still written once instead of vanishing.
        triples = question.get("triples") or []
        if not triples:
            writer.writerow([question["id"], id, "", "", "", question["body"], question["exact_answer"]])
            continue
        for triple in triples:
            writer.writerow([question["id"], id, triple["s"], triple["p"], triple["o"], question["body"], question["exact_answer"]])
 ```

 %% Cell type:code id:8f3197fc-f900-4857-bcc6-b7cd5f1a02c1 tags:

 ``` python
 # Column names of the filtered CSV. In the rows written below, s_raw, o_raw and
 # nl_resolved are always empty strings and p_raw holds the last path/fragment
 # segment of the predicate link.
 header_tsv = ["bioasq_id", "id", "s_bioasq", "p_bioasq", "o_bioasq", "s_raw", "p_raw", "o_raw", "nl_resolved", "question", "ground_truth"]

 # write filtered data to csv, filtering described in yesno_filtering.txt
 # `id` is a running 1-based question number shared by all rows of one question
 # (NOTE(review): it shadows the builtin id()). `yes` and `no` are initialised
 # here but not used in this cell.
 id = 0
 yes = 0
 no = 0
 # NOTE(review): the csv module documentation recommends opening the file with
 # newline="" -- confirm whether that is wanted here.
 with open("../data/bioasq_yesno_filtered.csv", 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_tsv)
    for question in filtered_data:
        id += 1
        # tracks whether at least one row has been emitted for this question
+        written = False
        if "triples" in question:
            for triple in question["triples"]:
                # check for the filtering criteria
                # a triple is skipped when its subject or object matches one of the
                # alternatives below, unless that same field also contains "umls"
                # NOTE(review): the dots in the pattern are unescaped, so they match any character
                filter_general = "fu-berlin|linkedlifedata|data.linkedct.org|purl.uniprot.org.*(citations|pubmed|tissues|intact|/go/)"
                match_s = re.search(filter_general, triple["s"])
                match_o = re.search(filter_general, triple["o"])
                if (match_s and not(re.search("umls", triple["s"]))):
                    continue
                if (match_o and not(re.search("umls", triple["o"]))):
                    continue
                # resolve the link for the relationship in p: keep only the text after the last "/"
                base = triple["p"].split("/")
                relation = base[-1].split("#")[-1]  # some of the links include a # for the exact path that needs to be removed

                writer.writerow([question["id"], id, triple["s"], triple["p"], triple["o"], "", relation, "", "", question["body"], question["exact_answer"]])
+                written = True
        else:
            # question without a "triples" key: emit one row with empty triple columns
            writer.writerow([question["id"], id, "", "", "", "", "", "", "", question["body"], question["exact_answer"]])
+            written = True
+        # if all triples are filtered the question would not be written without this
+        if not written:
+            writer.writerow([question["id"], id, "", "", "", "", "", "", "", question["body"], question["exact_answer"]])
 ```

 %% Cell type:code id:ae5d0729-b36f-49c9-b7de-0d00a70659b0 tags:

 ``` python
 ```