Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
thesis-klose-master
Manage
Activity
Members
Plan
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Analyze
Contributor analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Adrien Klose
thesis-klose-master
Commits
4c0567ad
Commit
4c0567ad
authored
10 months ago
by
Adrien Klose
Browse files
Options
Downloads
Patches
Plain Diff
fix bug that led to missing questions in the filtering
parent
ea12638d
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
ideas_practice/code/bioasq_extract_yes_no.ipynb
+6
-0
6 additions, 0 deletions
ideas_practice/code/bioasq_extract_yes_no.ipynb
with
6 additions
and
0 deletions
ideas_practice/code/bioasq_extract_yes_no.ipynb
+
6
−
0
View file @
4c0567ad
...
...
@@ -125,6 +125,7 @@
" writer.writerow(header_tsv)\n",
" for question in filtered_data:\n",
" id += 1\n",
" written = False\n",
" if \"triples\" in question:\n",
" for triple in question[\"triples\"]:\n",
" # check for the filtering criteria\n",
...
...
@@ -140,7 +141,12 @@
" relation = base[-1].split(\"#\")[-1] # some of the links include a # for the exact path that needs to be removed\n",
" \n",
" writer.writerow([question[\"id\"], id, triple[\"s\"], triple[\"p\"], triple[\"o\"], \"\", relation, \"\", \"\", question[\"body\"], question[\"exact_answer\"]])\n",
" written = True\n",
" else:\n",
" writer.writerow([question[\"id\"], id, \"\", \"\", \"\", \"\", \"\", \"\", \"\", question[\"body\"], question[\"exact_answer\"]])\n",
" written = True\n",
" # if all triples are filtered the question would not be written without this\n",
" if not written:\n",
" writer.writerow([question[\"id\"], id, \"\", \"\", \"\", \"\", \"\", \"\", \"\", question[\"body\"], question[\"exact_answer\"]])"
]
},
...
...
%% Cell type:code id:20bdf0bb-ec1d-4cde-bf92-29a2ee50d344 tags:
```
python
import
json
import
csv
import
re
```
%% Cell type:code id:51b4d7cb-9235-443d-b588-724d3cf06cc3 tags:
```
python
# Load the BioASQ training file and keep only its list of question records.
# The encoding is given explicitly so the read does not depend on the
# platform's locale default.
with open("../data/training12b_new.json", "r", encoding="utf-8") as f:
    train12b = json.load(f)["questions"]

# Sanity check: number of questions that were loaded.
print(len(train12b))
```
%% Output
5046
%% Cell type:code id:4a55b8a8-3f2b-4c95-b01a-c68e426081df tags:
```
python
### filter out the yesno questions
# Split the yes/no questions by their exact answer. Anything that is not
# exactly "yes" lands in the "no" bucket; input order is preserved per bucket.
yes_questions = [
    entry
    for entry in train12b
    if entry["type"] == "yesno" and entry["exact_answer"] == "yes"
]
no_questions = [
    entry
    for entry in train12b
    if entry["type"] == "yesno" and entry["exact_answer"] != "yes"
]

# Downstream cells iterate over this list: all "no" questions first, then "yes".
filtered_data = no_questions + yes_questions
#print(filtered_data[55])

# Report the bucket sizes ("no" first, then "yes").
print(len(no_questions))
print(len(yes_questions))
```
%% Output
354
1003
%% Cell type:code id:405e5736-2980-4594-be5f-8629f0e43960 tags:
```
python
## test csv writer
#with open("test_csv.csv", 'w') as csvfile:
# creating a csv writer object
# csvwriter = csv.writer(csvfile)
#
# # writing the fields
# csvwriter.writerow(["1","qweqwr,qwrqwr,,qwrqwr,,2","3"])
```
%% Cell type:code id:7c283cca-ae60-44ec-a1af-efdbba5b65c4 tags:
```
python
# Column layout of the unfiltered yes/no export.
header_tsv = [
    "bioasq_id",
    "id",
    "s_bioasq",
    "p_bioasq",
    "o_bioasq",
    "question",
    "ground_truth",
]
# the ideal answer might be interesting if we test more than yes and no

# Placeholders for a yes/no tally; nothing in this cell updates them.
yes = 0
no = 0

# write yes and no questions to csv
# newline="" is what the csv module expects of the file object (otherwise
# blank rows appear on platforms that translate line endings), and the
# explicit encoding keeps the output independent of the locale.
with open("../data/bioasq_yesno.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_tsv)

    # row_id is a 1-based running number per question, shared by all rows of
    # that question. It deliberately does not reuse the name of the builtin id().
    for row_id, question in enumerate(filtered_data, start=1):
        triples = question.get("triples") or []
        if triples:
            # One row per triple attached to the question.
            writer.writerows(
                [
                    question["id"],
                    row_id,
                    triple["s"],
                    triple["p"],
                    triple["o"],
                    question["body"],
                    question["exact_answer"],
                ]
                for triple in triples
            )
        else:
            # No triples (key missing or list empty): still emit the question
            # once with empty triple columns so it is not lost from the export.
            writer.writerow(
                [
                    question["id"],
                    row_id,
                    "",
                    "",
                    "",
                    question["body"],
                    question["exact_answer"],
                ]
            )
```
%% Cell type:code id:8f3197fc-f900-4857-bcc6-b7cd5f1a02c1 tags:
```
python
# Column layout of the filtered yes/no export.
header_tsv = [
    "bioasq_id",
    "id",
    "s_bioasq",
    "p_bioasq",
    "o_bioasq",
    "s_raw",
    "p_raw",
    "o_raw",
    "nl_resolved",
    "question",
    "ground_truth",
]

# Placeholders for a yes/no tally; nothing in this cell updates them.
yes = 0
no = 0

# write filtered data to csv, filtering described in yesno_filtering.txt
# The filter pattern is the same for every triple, so it is compiled once
# here instead of being rebuilt inside the loop.
filter_general = "fu-berlin|linkedlifedata|data.linkedct.org|purl.uniprot.org.*(citations|pubmed|tissues|intact|/go/)"
filter_pattern = re.compile(filter_general)


def _is_filtered_uri(uri):
    """Return True if *uri* matches the filter pattern and does not contain "umls"."""
    return bool(filter_pattern.search(uri)) and "umls" not in uri


# newline="" is what the csv module expects of the file object (otherwise
# blank rows appear on platforms that translate line endings), and the
# explicit encoding keeps the output independent of the locale.
with open("../data/bioasq_yesno_filtered.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_tsv)

    # row_id is a 1-based running number per question, shared by all rows of
    # that question. It deliberately does not reuse the name of the builtin id().
    for row_id, question in enumerate(filtered_data, start=1):
        kept_rows = []
        for triple in question.get("triples") or []:
            # check for the filtering criteria on subject and object
            if _is_filtered_uri(triple["s"]) or _is_filtered_uri(triple["o"]):
                continue

            # resolve the link for the relationship in p: keep the last path
            # segment, and for links with a "#" only the part after it
            relation = triple["p"].split("/")[-1].split("#")[-1]

            kept_rows.append(
                [
                    question["id"],
                    row_id,
                    triple["s"],
                    triple["p"],
                    triple["o"],
                    "",
                    relation,
                    "",
                    "",
                    question["body"],
                    question["exact_answer"],
                ]
            )

        if kept_rows:
            writer.writerows(kept_rows)
        else:
            # No triples at all, or every triple was filtered out: the
            # question is still written once with empty triple columns so it
            # does not go missing from the export.
            writer.writerow(
                [
                    question["id"],
                    row_id,
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    question["body"],
                    question["exact_answer"],
                ]
            )
```
%% Cell type:code id:ae5d0729-b36f-49c9-b7de-0d00a70659b0 tags:
```
python
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment