Skip to content

Commit

Permalink
Merge branch 'main' into feli1/featurestoretest
Browse files Browse the repository at this point in the history
  • Loading branch information
fredms authored Sep 7, 2023
2 parents 8324b18 + 4c1c62f commit a5ada66
Show file tree
Hide file tree
Showing 3 changed files with 828 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# import library to parse command line arguments
import argparse, os

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
# add an argument to specify a dataset name to download
parser.add_argument(
"--dataset_subset", type=str, default="split", help="dataset subset name"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
"--download_dir",
type=str,
default="data",
help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
os.makedirs(args.download_dir)


# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset):
# load the split of the dataset
dataset = load_dataset(args.dataset, split=split)
# save the split of the dataset to the download directory as json lines file
dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
# print dataset features
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Loading

0 comments on commit a5ada66

Please sign in to comment.