Skip to content
Snippets Groups Projects
Commit 4c0567ad authored by Adrien Klose's avatar Adrien Klose
Browse files

Fix bug that led to missing questions in the filtering

parent ea12638d
Branches
No related tags found
No related merge requests found
%% Cell type:code id:20bdf0bb-ec1d-4cde-bf92-29a2ee50d344 tags:
``` python
import json
import csv
import re
```
%% Cell type:code id:51b4d7cb-9235-443d-b588-724d3cf06cc3 tags:
``` python
# Load the question list stored under the top-level "questions" key.
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# being left to the platform default (which is not UTF-8 everywhere).
with open("../data/training12b_new.json", "r", encoding="utf-8") as f:
    train12b = json.load(f)["questions"]
print(len(train12b))
```
%% Output
5046
%% Cell type:code id:4a55b8a8-3f2b-4c95-b01a-c68e426081df tags:
``` python
### Keep only the yes/no questions, split by their exact answer.
yes_questions = []
no_questions = []
for question in train12b:
    # Every other question type is ignored.
    if question["type"] != "yesno":
        continue
    # Any exact answer other than the literal "yes" lands in the "no" bucket.
    (yes_questions if question["exact_answer"] == "yes" else no_questions).append(question)

# The combined list holds all "no" questions first, then all "yes" questions.
filtered_data = no_questions + yes_questions
print(len(no_questions))
print(len(yes_questions))
```
%% Output
354
1003
%% Cell type:code id:405e5736-2980-4594-be5f-8629f0e43960 tags:
``` python
## test csv writer
#with open("test_csv.csv", 'w') as csvfile:
# creating a csv writer object
# csvwriter = csv.writer(csvfile)
#
# # writing the fields
# csvwriter.writerow(["1","qweqwr,qwrqwr,,qwrqwr,,2","3"])
```
%% Cell type:code id:7c283cca-ae60-44ec-a1af-efdbba5b65c4 tags:
``` python
header_tsv = ["bioasq_id", "id", "s_bioasq", "p_bioasq", "o_bioasq", "question", "ground_truth"]
# The ideal answer might be interesting if we test more than yes and no.
#
# Write the yes and no questions to CSV: one row per triple of a question, or
# a single row with empty triple columns when the question has no triples.
#
# newline="" is required by the csv module for files it writes to (otherwise
# line endings can be doubled on some platforms), and the encoding is pinned
# so the output does not depend on the platform default.
with open("../data/bioasq_yesno.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_tsv)
    # row_id is a running 1-based question number; all rows that belong to the
    # same question share it.
    for row_id, question in enumerate(filtered_data, start=1):
        # A missing "triples" key and an empty triple list are handled alike,
        # so every question ends up in the file at least once.
        triples = question.get("triples") or [{"s": "", "p": "", "o": ""}]
        for triple in triples:
            writer.writerow([question["id"], row_id, triple["s"], triple["p"], triple["o"], question["body"], question["exact_answer"]])
```
%% Cell type:code id:8f3197fc-f900-4857-bcc6-b7cd5f1a02c1 tags:
``` python
header_tsv = ["bioasq_id", "id", "s_bioasq", "p_bioasq", "o_bioasq", "s_raw", "p_raw", "o_raw", "nl_resolved", "question", "ground_truth"]
# Write the filtered data to CSV; the filtering is described in yesno_filtering.txt.

# Sources whose triples are dropped. Compiled once instead of per triple; the
# dots in the host names are escaped so they only match a literal ".".
_FILTERED_SOURCES = re.compile(
    r"fu-berlin|linkedlifedata|data\.linkedct\.org"
    r"|purl\.uniprot\.org.*(citations|pubmed|tissues|intact|/go/)"
)


def _is_filtered(uri):
    """Return True if *uri* matches a filtered source and does not contain "umls"."""
    return bool(_FILTERED_SOURCES.search(uri)) and "umls" not in uri


# newline="" is required by the csv module for files it writes to, and the
# encoding is pinned so the output does not depend on the platform default.
with open("../data/bioasq_yesno_filtered.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_tsv)
    # row_id is a running 1-based question number; all rows that belong to the
    # same question share it.
    for row_id, question in enumerate(filtered_data, start=1):
        wrote_triple = False
        for triple in question.get("triples") or []:
            # A triple is skipped when its subject or its object is filtered.
            if _is_filtered(triple["s"]) or _is_filtered(triple["o"]):
                continue
            # Resolve the relation name from the predicate link: take the last
            # path segment, and for links with a "#" only the part after it.
            relation = triple["p"].rsplit("/", 1)[-1].rsplit("#", 1)[-1]
            writer.writerow([question["id"], row_id, triple["s"], triple["p"], triple["o"], "", relation, "", "", question["body"], question["exact_answer"]])
            wrote_triple = True
        # Questions without triples, and questions whose triples were all
        # filtered out, still get one row with empty triple columns; without
        # this they would be missing from the file.
        if not wrote_triple:
            writer.writerow([question["id"], row_id, "", "", "", "", "", "", "", question["body"], question["exact_answer"]])
```
%% Cell type:code id:ae5d0729-b36f-49c9-b7de-0d00a70659b0 tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment