-
Notifications
You must be signed in to change notification settings - Fork 101
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7c67746
commit ab607d3
Showing
720 changed files
with
121,668 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
# Data curation | ||
# Data | ||
|
||
TODO | ||
The scripts in `datasets_processing_scripts` are the ones we used to create all the datasets used for training SmolVLM. |
51 changes: 51 additions & 0 deletions
51
...cessing_scripts/01_tar_datasets_with_jpeg/python_scripts/01_convert_coco_per_shard_idx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import argparse | ||
import logging | ||
from pathlib import Path | ||
|
||
from datasets import set_caching_enabled | ||
|
||
from m4.training.types import DatasetTypes | ||
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar | ||
|
||
|
||
# Configure the root logger: INFO and above, "time - level - logger name - message" layout.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` caching flag for the whole run.
# NOTE(review): newer `datasets` releases expose `disable_caching()` for this — confirm
# which version is pinned before upgrading.
set_caching_enabled(False)
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of this conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse falls back to ``sys.argv[1:]``, which is the
            original behavior; passing a list allows programmatic use.

    Returns:
        argparse.Namespace with ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects, and ``num_examples_per_shard``, ``num_proc``,
        ``shard_idx`` and ``min_num_shards`` as ints. ``min_num_shards`` is
        ``None`` when the option is omitted; every other option is required.
    """
    parser = argparse.ArgumentParser()
    for path_flag in ("--shard_dir_path", "--saving_dir"):
        parser.add_argument(path_flag, type=Path, required=True)
    for int_flag in ("--num_examples_per_shard", "--num_proc", "--shard_idx"):
        parser.add_argument(int_flag, type=int, required=True)
    # Optional: stays None unless given on the command line.
    parser.add_argument("--min_num_shards", type=int)
    return parser.parse_args(argv)
|
||
|
||
def main(args):
    """Run the per-shard-index tar export over every entry of ``args.shard_dir_path``.

    All entries of the directory are passed to
    ``export_dataset_shard_idx_to_tar`` as ``hf_datasets_paths`` with the
    dataset type fixed to ``DatasetTypes.IMAGE_CAPTION_PAIRS``; the remaining
    options are forwarded unchanged from ``args``.
    """
    # NOTE(review): ``Path.iterdir`` yields entries in arbitrary order — confirm the
    # exporter does not rely on a stable order across separate per-shard runs.
    dataset_dirs = list(args.shard_dir_path.iterdir())

    export_dataset_shard_idx_to_tar(
        hf_datasets_paths=dataset_dirs,
        saving_dir=args.saving_dir,
        ds_type=DatasetTypes.IMAGE_CAPTION_PAIRS,
        num_examples_per_shard=args.num_examples_per_shard,
        num_proc=args.num_proc,
        shard_idx=args.shard_idx,
        min_num_shards=args.min_num_shards,
    )
|
||
|
||
# Script entry point: parse the CLI options and run the export.
if __name__ == "__main__":
    main(get_args())
51 changes: 51 additions & 0 deletions
51
...ocessing_scripts/01_tar_datasets_with_jpeg/python_scripts/02_convert_cm4_per_shard_idx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import argparse | ||
import logging | ||
from pathlib import Path | ||
|
||
from datasets import set_caching_enabled | ||
|
||
from m4.training.types import DatasetTypes | ||
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar | ||
|
||
|
||
# Root logger setup: INFO threshold, timestamped "time - level - logger name - message" records.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)
# Logger for this module, also set to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Disable the `datasets` caching flag before any dataset work happens.
# NOTE(review): newer `datasets` releases expose `disable_caching()` for this — confirm
# which version is pinned before upgrading.
set_caching_enabled(False)
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of this conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, exactly as before;
            passing a list makes the parser usable without touching
            ``sys.argv``.

    Returns:
        argparse.Namespace holding ``shard_dir_path`` and ``saving_dir``
        (``Path``), plus ``num_examples_per_shard``, ``num_proc``,
        ``shard_idx`` and ``min_num_shards`` (int). Only ``min_num_shards``
        is optional and defaults to ``None``.
    """
    parser = argparse.ArgumentParser()
    for path_flag in ("--shard_dir_path", "--saving_dir"):
        parser.add_argument(path_flag, type=Path, required=True)
    for int_flag in ("--num_examples_per_shard", "--num_proc", "--shard_idx"):
        parser.add_argument(int_flag, type=int, required=True)
    # Optional: stays None unless given on the command line.
    parser.add_argument("--min_num_shards", type=int)
    return parser.parse_args(argv)
|
||
|
||
def main(args):
    """Run the per-shard-index tar export over every entry of ``args.shard_dir_path``.

    All entries of the directory are passed to
    ``export_dataset_shard_idx_to_tar`` as ``hf_datasets_paths`` with the
    dataset type fixed to ``DatasetTypes.WEB_DOCUMENTS``; the remaining
    options are forwarded unchanged from ``args``.
    """
    # NOTE(review): ``Path.iterdir`` yields entries in arbitrary order — confirm the
    # exporter does not rely on a stable order across separate per-shard runs.
    dataset_dirs = list(args.shard_dir_path.iterdir())

    export_dataset_shard_idx_to_tar(
        hf_datasets_paths=dataset_dirs,
        saving_dir=args.saving_dir,
        ds_type=DatasetTypes.WEB_DOCUMENTS,
        num_examples_per_shard=args.num_examples_per_shard,
        num_proc=args.num_proc,
        shard_idx=args.shard_idx,
        min_num_shards=args.min_num_shards,
    )
|
||
|
||
# Script entry point: parse the CLI options and run the export.
if __name__ == "__main__":
    main(get_args())
56 changes: 56 additions & 0 deletions
56
...essing_scripts/01_tar_datasets_with_jpeg/python_scripts/03_convert_laoin_per_shard_idx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import argparse | ||
import logging | ||
from pathlib import Path | ||
|
||
from datasets import set_caching_enabled | ||
|
||
from m4.training.types import DatasetTypes | ||
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar | ||
|
||
|
||
# Configure the root logger: INFO and above, "time - level - logger name - message" layout.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` caching flag for the whole run.
# NOTE(review): newer `datasets` releases expose `disable_caching()` for this — confirm
# which version is pinned before upgrading.
set_caching_enabled(False)
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of this conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is the original
            behavior.

    Returns:
        argparse.Namespace with ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects and ``num_proc``, ``shard_idx`` and
        ``laoin_shard_idx`` as required ints. ``num_examples_per_shard``,
        ``min_num_shards`` and ``save_shard_idx`` are optional ints that
        default to ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--shard_dir_path", type=Path, required=True)
    parser.add_argument("--saving_dir", type=Path, required=True)
    parser.add_argument("--num_examples_per_shard", type=int)
    parser.add_argument("--num_proc", type=int, required=True)
    parser.add_argument("--shard_idx", type=int, required=True)
    # The historical flag name is misspelled ("laoin"). It is kept as the
    # primary spelling and as the attribute name so existing launch scripts
    # and callers keep working; the correctly spelled alias is accepted too.
    parser.add_argument(
        "--laoin_shard_idx",
        "--laion_shard_idx",
        dest="laoin_shard_idx",
        type=int,
        required=True,
    )
    parser.add_argument("--min_num_shards", type=int)
    parser.add_argument("--save_shard_idx", type=int, default=None)
    return parser.parse_args(argv)
|
||
|
||
def main(args):
    """Run the per-shard-index tar export for one sub-directory.

    Only the entries of ``args.shard_dir_path`` whose name equals
    ``str(args.laoin_shard_idx)`` are passed to
    ``export_dataset_shard_idx_to_tar``, with the dataset type fixed to
    ``DatasetTypes.IMAGE_CAPTION_PAIRS``; the remaining options are forwarded
    unchanged from ``args``.
    """
    wanted_name = str(args.laoin_shard_idx)
    # NOTE(review): if nothing matches, an empty list is forwarded as-is — confirm
    # how the exporter handles that.
    matching_dirs = [entry for entry in args.shard_dir_path.iterdir() if entry.name == wanted_name]

    export_dataset_shard_idx_to_tar(
        hf_datasets_paths=matching_dirs,
        saving_dir=args.saving_dir,
        ds_type=DatasetTypes.IMAGE_CAPTION_PAIRS,
        num_examples_per_shard=args.num_examples_per_shard,
        num_proc=args.num_proc,
        shard_idx=args.shard_idx,
        min_num_shards=args.min_num_shards,
        save_shard_idx=args.save_shard_idx,
    )
|
||
|
||
# Script entry point: parse the CLI options and run the export.
if __name__ == "__main__":
    main(get_args())
56 changes: 56 additions & 0 deletions
56
...ocessing_scripts/01_tar_datasets_with_jpeg/python_scripts/04_convert_cm4_per_shard_idx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import argparse | ||
import logging | ||
from pathlib import Path | ||
|
||
from datasets import set_caching_enabled | ||
|
||
from m4.training.types import DatasetTypes | ||
from m4.utils.datasets.create_webdataset_tar import export_dataset_all_shard_idx_to_tar | ||
|
||
|
||
# Root logger setup: INFO threshold, timestamped "time - level - logger name - message" records.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)
# Logger for this module, also set to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Disable the `datasets` caching flag before any dataset work happens.
# NOTE(review): newer `datasets` releases expose `disable_caching()` for this — confirm
# which version is pinned before upgrading.
set_caching_enabled(False)
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of this conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is the original
            behavior.

    Returns:
        argparse.Namespace with ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects, ``num_proc`` and ``raw_cm4_shard_idx`` as required
        ints, and ``s3_uri`` as a required string.
        ``num_examples_per_shard``, ``min_num_shards`` (int) and
        ``save_shard_prefix`` (str) are optional and default to ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--shard_dir_path", type=Path, required=True)
    parser.add_argument("--saving_dir", type=Path, required=True)
    parser.add_argument("--num_examples_per_shard", type=int)
    parser.add_argument("--num_proc", type=int, required=True)
    parser.add_argument("--raw_cm4_shard_idx", type=int, required=True)
    parser.add_argument("--min_num_shards", type=int)
    parser.add_argument("--save_shard_prefix", type=str, default=None)
    parser.add_argument("--s3_uri", type=str, required=True)
    return parser.parse_args(argv)
|
||
|
||
def main(args):
    """Run the all-shard-index tar export for one sub-directory.

    Only the entries of ``args.shard_dir_path`` whose name equals
    ``str(args.raw_cm4_shard_idx)`` are passed to
    ``export_dataset_all_shard_idx_to_tar``, with the dataset type fixed to
    ``DatasetTypes.WEB_DOCUMENTS``; the remaining options (including
    ``save_shard_prefix`` and ``s3_uri``) are forwarded unchanged from
    ``args``.
    """
    wanted_name = str(args.raw_cm4_shard_idx)
    # NOTE(review): if nothing matches, an empty list is forwarded as-is — confirm
    # how the exporter handles that.
    matching_dirs = [entry for entry in args.shard_dir_path.iterdir() if entry.name == wanted_name]

    export_dataset_all_shard_idx_to_tar(
        hf_datasets_paths=matching_dirs,
        saving_dir=args.saving_dir,
        ds_type=DatasetTypes.WEB_DOCUMENTS,
        num_examples_per_shard=args.num_examples_per_shard,
        num_proc=args.num_proc,
        min_num_shards=args.min_num_shards,
        save_shard_prefix=args.save_shard_prefix,
        s3_uri=args.s3_uri,
    )
|
||
|
||
# Script entry point: parse the CLI options and run the export.
if __name__ == "__main__":
    main(get_args())
56 changes: 56 additions & 0 deletions
56
...scripts/01_tar_datasets_with_jpeg/python_scripts/05_convert_scaled_laion_per_shard_idx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import argparse | ||
import logging | ||
from pathlib import Path | ||
|
||
from datasets import set_caching_enabled | ||
|
||
from m4.training.types import DatasetTypes | ||
from m4.utils.datasets.create_webdataset_tar import export_dataset_all_shard_idx_to_tar | ||
|
||
|
||
# Configure the root logger: INFO and above, "time - level - logger name - message" layout.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` caching flag for the whole run.
# NOTE(review): newer `datasets` releases expose `disable_caching()` for this — confirm
# which version is pinned before upgrading.
set_caching_enabled(False)
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of this conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is the original
            behavior.

    Returns:
        argparse.Namespace with ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects, ``num_proc`` and ``raw_laion_shard_idx`` as
        required ints, and ``s3_uri`` as a required string.
        ``num_examples_per_shard``, ``min_num_shards`` (int) and
        ``save_shard_prefix`` (str) are optional and default to ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--shard_dir_path", type=Path, required=True)
    parser.add_argument("--saving_dir", type=Path, required=True)
    parser.add_argument("--num_examples_per_shard", type=int)
    parser.add_argument("--num_proc", type=int, required=True)
    parser.add_argument("--raw_laion_shard_idx", type=int, required=True)
    parser.add_argument("--min_num_shards", type=int)
    parser.add_argument("--save_shard_prefix", type=str, default=None)
    parser.add_argument("--s3_uri", type=str, required=True)
    return parser.parse_args(argv)
|
||
|
||
def main(args):
    """Run the all-shard-index tar export for one sub-directory.

    Only the entries of ``args.shard_dir_path`` whose name equals
    ``str(args.raw_laion_shard_idx)`` are passed to
    ``export_dataset_all_shard_idx_to_tar``, with the dataset type fixed to
    ``DatasetTypes.IMAGE_CAPTION_PAIRS``; the remaining options (including
    ``save_shard_prefix`` and ``s3_uri``) are forwarded unchanged from
    ``args``.
    """
    wanted_name = str(args.raw_laion_shard_idx)
    # NOTE(review): if nothing matches, an empty list is forwarded as-is — confirm
    # how the exporter handles that.
    matching_dirs = [entry for entry in args.shard_dir_path.iterdir() if entry.name == wanted_name]

    export_dataset_all_shard_idx_to_tar(
        hf_datasets_paths=matching_dirs,
        saving_dir=args.saving_dir,
        ds_type=DatasetTypes.IMAGE_CAPTION_PAIRS,
        num_examples_per_shard=args.num_examples_per_shard,
        num_proc=args.num_proc,
        min_num_shards=args.min_num_shards,
        save_shard_prefix=args.save_shard_prefix,
        s3_uri=args.s3_uri,
    )
|
||
|
||
# Script entry point: parse the CLI options and run the export.
if __name__ == "__main__":
    main(get_args())
49 changes: 49 additions & 0 deletions
49
...ocessing_scripts/01_tar_datasets_with_jpeg/python_scripts/06_convert_lrv_per_shard_idx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import argparse | ||
import logging | ||
from pathlib import Path | ||
|
||
from datasets import set_caching_enabled | ||
|
||
from m4.training.types import DatasetTypes | ||
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar | ||
|
||
|
||
# Root logger setup: INFO threshold, timestamped "time - level - logger name - message" records.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)
# Logger for this module, also set to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Disable the `datasets` caching flag before any dataset work happens.
# NOTE(review): newer `datasets` releases expose `disable_caching()` for this — confirm
# which version is pinned before upgrading.
set_caching_enabled(False)
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of this conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is the original
            behavior.

    Returns:
        argparse.Namespace with ``saving_dir`` as a ``Path`` and
        ``num_examples_per_shard``, ``num_proc``, ``shard_idx`` and
        ``min_num_shards`` as ints. Only ``min_num_shards`` is optional and
        defaults to ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--saving_dir", type=Path, required=True)
    for int_flag in ("--num_examples_per_shard", "--num_proc", "--shard_idx"):
        parser.add_argument(int_flag, type=int, required=True)
    # Optional: stays None unless given on the command line.
    parser.add_argument("--min_num_shards", type=int)
    return parser.parse_args(argv)
|
||
|
||
def main(args):
    """Run the per-shard-index tar export for the LRV instruction dataset.

    The dataset path is hard-coded to ``"VictorSanh/LrvInstruction:train"``
    and the dataset type to ``DatasetTypes.LRV_PAIRS``; the remaining options
    are forwarded unchanged from ``args`` to
    ``export_dataset_shard_idx_to_tar``.
    """
    export_options = {
        "hf_datasets_paths": ["VictorSanh/LrvInstruction:train"],
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.LRV_PAIRS,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "shard_idx": args.shard_idx,
        "min_num_shards": args.min_num_shards,
    }
    export_dataset_shard_idx_to_tar(**export_options)
|
||
|
||
# Script entry point: parse the CLI options and run the export.
if __name__ == "__main__":
    main(get_args())
49 changes: 49 additions & 0 deletions
49
...essing_scripts/01_tar_datasets_with_jpeg/python_scripts/07_convert_llava_per_shard_idx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import argparse | ||
import logging | ||
from pathlib import Path | ||
|
||
from datasets import set_caching_enabled | ||
|
||
from m4.training.types import DatasetTypes | ||
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar | ||
|
||
|
||
# Configure the root logger: INFO and above, "time - level - logger name - message" layout.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` caching flag for the whole run.
# NOTE(review): newer `datasets` releases expose `disable_caching()` for this — confirm
# which version is pinned before upgrading.
set_caching_enabled(False)
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of this conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is the original
            behavior.

    Returns:
        argparse.Namespace with ``saving_dir`` as a ``Path`` and
        ``num_examples_per_shard``, ``num_proc``, ``shard_idx`` and
        ``min_num_shards`` as ints. Only ``min_num_shards`` is optional and
        defaults to ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--saving_dir", type=Path, required=True)
    for int_flag in ("--num_examples_per_shard", "--num_proc", "--shard_idx"):
        parser.add_argument(int_flag, type=int, required=True)
    # Optional: stays None unless given on the command line.
    parser.add_argument("--min_num_shards", type=int)
    return parser.parse_args(argv)
|
||
|
||
def main(args):
    """Run the per-shard-index tar export for the LLaVA instruction dataset.

    The dataset path is hard-coded to
    ``"HuggingFaceM4/LLaVA-Instruct-150K:train"`` and the dataset type to
    ``DatasetTypes.LLaVA``; the remaining options are forwarded unchanged
    from ``args`` to ``export_dataset_shard_idx_to_tar``.
    """
    export_options = {
        "hf_datasets_paths": ["HuggingFaceM4/LLaVA-Instruct-150K:train"],
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.LLaVA,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "shard_idx": args.shard_idx,
        "min_num_shards": args.min_num_shards,
    }
    export_dataset_shard_idx_to_tar(**export_options)
|
||
|
||
# Script entry point: parse the CLI options and run the export.
if __name__ == "__main__":
    main(get_args())
Oops, something went wrong.