Skip to content
Snippets Groups Projects
Commit 75aa6392 authored by codingma
Browse files

Support previewing custom datasets stored in directory format

parent b3ac14ff
Branches
No related tags found
No related merge requests found
......@@ -28,30 +28,44 @@ def can_preview(dataset_dir: str, dataset: list) -> "gr.Button":
dataset_info = json.load(f)
except Exception:
return gr.Button(interactive=False)
local_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
if (
len(dataset) > 0
and "file_name" in dataset_info[dataset[0]]
and os.path.isfile(os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]))
len(dataset) > 0
and "file_name" in dataset_info[dataset[0]]
and (os.path.isfile(local_path)
or (os.path.isdir(local_path)) and len(os.listdir(local_path)) != 0)
):
return gr.Button(interactive=True)
else:
return gr.Button(interactive=False)
def load_single_data(data_file_path):
    """Load the contents of a single dataset file.

    The parser is chosen from the file name suffix:

    * ``.json``  -- the whole file is parsed as one JSON document.
    * ``.jsonl`` -- every non-blank line is parsed as its own JSON document.
    * anything else -- the raw text lines are returned (newlines included).

    Args:
        data_file_path: Path of the file to read. It is opened as UTF-8 text.

    Returns:
        The parsed JSON value for ``.json`` files, otherwise a list with one
        entry per line (parsed objects for ``.jsonl``, strings for other files).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a ``.json`` file or a non-blank ``.jsonl``
            line is not valid JSON.
    """
    with open(data_file_path, "r", encoding="utf-8") as f:
        if data_file_path.endswith(".json"):
            return json.load(f)
        if data_file_path.endswith(".jsonl"):
            # A blank line is not a JSON document; skip it instead of letting
            # json.loads abort the whole preview.
            return [json.loads(line) for line in f if line.strip()]
        return list(f)
def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, "gr.Column"]:
    """Return one page of samples from the first selected dataset.

    The dataset's ``file_name`` entry in the ``DATA_CONFIG`` file may point at
    a single file or at a directory. For a directory, the contents of every
    regular file directly inside it are concatenated.

    Args:
        dataset_dir: Directory containing ``DATA_CONFIG`` and the data files.
        dataset: Selected dataset names; only ``dataset[0]`` is previewed.
        page_index: Zero-based index of the page to return.

    Returns:
        A tuple ``(total, page, column)``: the total number of loaded samples,
        the slice of at most ``PAGE_SIZE`` samples for ``page_index``, and a
        visible ``gr.Column``.
    """
    with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f:
        dataset_info = json.load(f)

    data_file: str = dataset_info[dataset[0]]["file_name"]
    local_path = os.path.join(dataset_dir, data_file)
    if os.path.isdir(local_path):
        data = []
        # os.listdir gives entries in arbitrary order; sort them so the same
        # page index always shows the same samples.
        for file_name in sorted(os.listdir(local_path)):
            file_path = os.path.join(local_path, file_name)
            # Nested directories cannot be opened as data files; skip them.
            if os.path.isfile(file_path):
                data.extend(load_single_data(file_path))
    else:
        data = load_single_data(local_path)

    start = PAGE_SIZE * page_index
    return len(data), data[start : start + PAGE_SIZE], gr.Column(visible=True)
def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dict[str, "Component"]:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment