Skip to content
Snippets Groups Projects
Commit dea1b2ec authored by Donatus Herre's avatar Donatus Herre
Browse files

src/extract.py updated

parent c6714f72
Branches
No related tags found
No related merge requests found
src/extract.py 100644 → 100755
......@@ -8,7 +8,7 @@ Creator: D. Herre
GitLab: dikon/vkg-json
Created: 2019-07-08
Last Modified: 2020-04-28
Last Modified: 2020-04-29
Usage:
......@@ -56,7 +56,7 @@ re_desc = re.compile("(^\\[?\\d\\d?\\]?\\sBl\\.|^\\d+,|^\\d+\\sS\\.|^[A-Z]+\\sS\
"|^[A-Z]+,\\s\\d+,)(?!\\.,\\sTeil)")
def read_raw(path='../data/verlagsbibliographie.txt'):
def read_raw(path='./data/verlagsbibliographie.txt'):
"""
read the plain text file
"""
......@@ -275,7 +275,7 @@ def write_list_of_list(filepath, listoflists, title='LIST OF LIST', extra=True):
########################
def write_count_total(path='./output/count_total.txt', total=0):
def write_count_total(path='./src/output/count_total.txt', total=0):
"""
save total number of entries to output/count_total.txt
"""
......@@ -284,7 +284,7 @@ def write_count_total(path='./output/count_total.txt', total=0):
" ENTRIES!")
def write_count_years(path='./output/count_years.txt', entries={}, tab=' '):
def write_count_years(path='./src/output/count_years.txt', entries={}, tab=' '):
"""
save number of entries per year to output/count_years.txt
"""
......@@ -294,7 +294,7 @@ def write_count_years(path='./output/count_years.txt', entries={}, tab=' '):
f.write(str(year) + ":" + tab + str(len(entries[year])) + "\n")
def write_count_refs(path='./output/count_refs.txt', entries={}, tab=' '):
def write_count_refs(path='./src/output/count_refs.txt', entries={}, tab=' '):
"""
save total number of references to output/count_refs.txt
(volumes from multi volumes count as 1)
......@@ -315,7 +315,7 @@ def write_count_refs(path='./output/count_refs.txt', entries={}, tab=' '):
print("found", total, "references!")
def write_count_lines(path='./output/count_lines.txt', len_entries={}, tab=' '):
def write_count_lines(path='./src/output/count_lines.txt', len_entries={}, tab=' '):
"""
save count of entries with lines of length x to output/count_lines.txt
"""
......@@ -328,7 +328,7 @@ def write_count_lines(path='./output/count_lines.txt', len_entries={}, tab='
str(len(len_entries[len_entry])) + "\n")
def write_lines_x_y(out_dir='./output/', len_entries={}, tab=' '):
def write_lines_x_y(out_dir='./src/output/', len_entries={}, tab=' '):
"""
save content of line x from entries with length y to output/lines_x.txt
"""
......@@ -383,27 +383,27 @@ def write_total_elements(entries):
save elements extracted from entries to separate files
"""
years = [str(entry['year']) for entry in entries]
write_list('output/total_years.txt', years, title='TOTAL YEARS')
write_list('./src/output/total_years.txt', years, title='TOTAL YEARS')
counter = [entry['counter'] for entry in entries]
write_list('output/total_counters.txt', counter, title='TOTAL COUNTERS')
write_list('./src/output/total_counters.txt', counter, title='TOTAL COUNTERS')
creators = [entry['creator'] for entry in entries]
write_list('output/total_creators.txt', creators, title='TOTAL CREATORS')
write_list('./src/output/total_creators.txt', creators, title='TOTAL CREATORS')
titles = [entry['title'] for entry in entries]
write_list('output/total_titles.txt', titles, title='TOTAL TITLES')
write_list('./src/output/total_titles.txt', titles, title='TOTAL TITLES')
print = [entry['print'] for entry in entries if 'print' in entry]
write_list('output/total_prints.txt', print, title='TOTAL PRINTS')
write_list('./src/output/total_prints.txt', print, title='TOTAL PRINTS')
print_extras = [entry['print-extra'] for entry in entries if 'print-extra' in entry]
write_list('output/total_print-extras.txt', print_extras, title='TOTAL PRINT EXTRAS')
write_list('./src/output/total_print-extras.txt', print_extras, title='TOTAL PRINT EXTRAS')
description = [entry['description'] for entry in entries if 'description' in entry]
write_list('output/total_descriptions.txt', description, title='TOTAL DESCRIPTIONS')
write_list('./src/output/total_descriptions.txt', description, title='TOTAL DESCRIPTIONS')
miscs = [entry['misc'] for entry in entries if 'misc' in entry]
write_list_of_list('output/total_misc.txt', miscs, title='TOTAL MISCELLANEOUS', extra=False)
write_list_of_list('./src/output/total_misc.txt', miscs, title='TOTAL MISCELLANEOUS', extra=False)
parts = [entry['parts'] for entry in entries if 'parts' in entry]
write_list_of_list('output/total_parts.txt', parts, title='TOTAL PARTS (DESCRIPTION)')
write_list_of_list('./src/output/total_parts.txt', parts, title='TOTAL PARTS (DESCRIPTION)')
parts_print = [entry['parts-print'] for entry in entries if 'parts-print' in entry]
write_list('output/total_parts-print.txt', parts_print, title='TOTAL PARTS (PRINT)')
write_list('./src/output/total_parts-print.txt', parts_print, title='TOTAL PARTS (PRINT)')
parts_print_dates = [entry['parts-print-dates'] for entry in entries if 'parts-print-dates' in entry]
write_list('output/total_parts-print-dates.txt', parts_print_dates, title='TOTAL PARTS (DATES)')
write_list('./src/output/total_parts-print-dates.txt', parts_print_dates, title='TOTAL PARTS (DATES)')
def write_creator_roles(entries):
......@@ -412,28 +412,28 @@ def write_creator_roles(entries):
"""
creators = [entry['creator'] for entry in entries]
creators_roles = [creator for creator in creators if "(" in creator]
write_list('output/total_creators_roles.txt', creators_roles, title="CREATOR WITH ROLE")
write_list('./src/output/total_creators_roles.txt', creators_roles, title="CREATOR WITH ROLE")
creators_editors = [creator for creator in creators_roles if "(Hg.)" in creator or "(Hg)" in creator]
write_list('output/total_creators_roles_editors.txt', creators_editors,\
write_list('./src/output/total_creators_roles_editors.txt', creators_editors,\
title="ROLE OF CREATOR: EDITOR")
creator_translators = [creator for creator in creators_roles if "(Übers.)" in creator]
write_list('output/total_creators_roles_translators.txt', creator_translators,\
write_list('./src/output/total_creators_roles_translators.txt', creator_translators,\
title="ROLE OF CREATOR: TRANSLATOR")
creator_contributors = [creator for creator in creators_roles if "(Beitr.)" in creator]
write_list('output/total_creators_roles_contributors.txt', creator_contributors,\
write_list('./src/output/total_creators_roles_contributors.txt', creator_contributors,\
title="ROLE OF CREATOR: CONTRIBUTOR")
other = [creator for creator in creators_roles if creator not in creators_editors and\
creator not in creator_translators and creator not in creator_contributors]
write_list('output/total_creators_roles_special.txt', other,\
write_list('./src/output/total_creators_roles_special.txt', other,\
title="ROLE OF CREATOR: SPECIAL")
creators_authors = [creator for creator in creators if creator not in creators_roles]
write_list('output/total_creators_roles_authors.txt', creators_authors,\
write_list('./src/output/total_creators_roles_authors.txt', creators_authors,\
title="ROLE OF CREATOR: AUTHOR")
creators_authors_special = [creator for creator in creators_authors if "," not in creator]
write_list('output/total_creators_roles_authors_special.txt', creators_authors_special,\
write_list('./src/output/total_creators_roles_authors_special.txt', creators_authors_special,\
title="ROLE OF CREATOR: AUTHOR (SPECIAL FORMAT)")
creators_authors_normal = [creator for creator in creators_authors if creator not in creators_authors_special]
write_list('output/total_creators_roles_authors_normal.txt', creators_authors_normal,\
write_list('./src/output/total_creators_roles_authors_normal.txt', creators_authors_normal,\
title="ROLE OF CREATOR: AUTHOR (NORMAL FORMAT)")
......@@ -450,7 +450,7 @@ def extract_entries(output=True):
len_entries = length_entries(entries, output=output)
entries = structure_entries(entries)
if output:
write_json('../data/verlagsbibliographie.json', entries)
write_json('./data/verlagsbibliographie.json', entries)
write_total_elements(entries)
write_creator_roles(entries)
return entries
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment