Skip to content

Commit

Permalink
add code from m4
Browse files Browse the repository at this point in the history
  • Loading branch information
andimarafioti committed Jan 29, 2025
1 parent 7c67746 commit ab607d3
Show file tree
Hide file tree
Showing 720 changed files with 121,668 additions and 2 deletions.
4 changes: 2 additions & 2 deletions vision/data/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Data curation
# Data

TODO
The scripts inside `datasets_processing_scripts` are the ones we used to create all the datasets used for training SmolVLM.
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import argparse
import logging
from pathlib import Path

from datasets import set_caching_enabled

from m4.training.types import DatasetTypes
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar


# Import-time setup: configure the root logger (INFO level, timestamped
# "time - level - logger name - message" records).
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` library caching for this process.
# NOTE(review): `set_caching_enabled` looks deprecated in newer `datasets`
# releases in favor of `disable_caching()` — confirm against the pinned version.
set_caching_enabled(False)


def get_args():
    """Parse the command-line flags of this export script.

    Returns:
        argparse.Namespace holding ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects, ``num_examples_per_shard``, ``num_proc`` and
        ``shard_idx`` as ints, and ``min_num_shards`` as an int or ``None``
        when the flag is omitted.
    """
    cli = argparse.ArgumentParser()

    # Mandatory flags, registered in the order they should appear in --help.
    required_flags = (
        ("--shard_dir_path", Path),
        ("--saving_dir", Path),
        ("--num_examples_per_shard", int),
        ("--num_proc", int),
        ("--shard_idx", int),
    )
    for flag, flag_type in required_flags:
        cli.add_argument(flag, type=flag_type, required=True)

    # Optional flag: stays None unless given on the command line.
    cli.add_argument("--min_num_shards", type=int)

    return cli.parse_args()


def main(args):
    """Run ``export_dataset_shard_idx_to_tar`` over every entry of the shard directory.

    Each entry found directly under ``args.shard_dir_path`` is passed as a
    dataset path, with the dataset type fixed to
    ``DatasetTypes.IMAGE_CAPTION_PAIRS`` and the remaining options forwarded
    unchanged from the CLI.

    Args:
        args: Parsed CLI namespace, as returned by ``get_args``.
    """
    # NOTE(review): iterdir() yields entries in arbitrary order and also
    # returns plain files — confirm the exporter does not depend on a stable
    # ordering across jobs launched with different shard_idx values.
    dataset_paths = list(args.shard_dir_path.iterdir())

    export_kwargs = {
        "hf_datasets_paths": dataset_paths,
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.IMAGE_CAPTION_PAIRS,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "shard_idx": args.shard_idx,
        "min_num_shards": args.min_num_shards,
    }
    export_dataset_shard_idx_to_tar(**export_kwargs)


if __name__ == "__main__":
    # Script entry point: parse the CLI flags, then run the export.
    main(get_args())
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import argparse
import logging
from pathlib import Path

from datasets import set_caching_enabled

from m4.training.types import DatasetTypes
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar


# Import-time setup: configure the root logger (INFO level, timestamped
# "time - level - logger name - message" records).
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` library caching for this process.
# NOTE(review): `set_caching_enabled` looks deprecated in newer `datasets`
# releases in favor of `disable_caching()` — confirm against the pinned version.
set_caching_enabled(False)


def get_args():
    """Parse the command-line flags of this export script.

    Returns:
        argparse.Namespace holding ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects, ``num_examples_per_shard``, ``num_proc`` and
        ``shard_idx`` as ints, and ``min_num_shards`` as an int or ``None``
        when the flag is omitted.
    """
    cli = argparse.ArgumentParser()

    # Mandatory flags, registered in the order they should appear in --help.
    required_flags = (
        ("--shard_dir_path", Path),
        ("--saving_dir", Path),
        ("--num_examples_per_shard", int),
        ("--num_proc", int),
        ("--shard_idx", int),
    )
    for flag, flag_type in required_flags:
        cli.add_argument(flag, type=flag_type, required=True)

    # Optional flag: stays None unless given on the command line.
    cli.add_argument("--min_num_shards", type=int)

    return cli.parse_args()


def main(args):
    """Run ``export_dataset_shard_idx_to_tar`` over every entry of the shard directory.

    Each entry found directly under ``args.shard_dir_path`` is passed as a
    dataset path, with the dataset type fixed to
    ``DatasetTypes.WEB_DOCUMENTS`` and the remaining options forwarded
    unchanged from the CLI.

    Args:
        args: Parsed CLI namespace, as returned by ``get_args``.
    """
    # NOTE(review): iterdir() yields entries in arbitrary order and also
    # returns plain files — confirm the exporter does not depend on a stable
    # ordering across jobs launched with different shard_idx values.
    dataset_paths = list(args.shard_dir_path.iterdir())

    export_kwargs = {
        "hf_datasets_paths": dataset_paths,
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.WEB_DOCUMENTS,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "shard_idx": args.shard_idx,
        "min_num_shards": args.min_num_shards,
    }
    export_dataset_shard_idx_to_tar(**export_kwargs)


if __name__ == "__main__":
    # Script entry point: parse the CLI flags, then run the export.
    main(get_args())
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import argparse
import logging
from pathlib import Path

from datasets import set_caching_enabled

from m4.training.types import DatasetTypes
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar


# Import-time setup: configure the root logger (INFO level, timestamped
# "time - level - logger name - message" records).
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` library caching for this process.
# NOTE(review): `set_caching_enabled` looks deprecated in newer `datasets`
# releases in favor of `disable_caching()` — confirm against the pinned version.
set_caching_enabled(False)


def get_args():
    """Parse the command-line flags of this export script.

    The shard-selection flag was originally misspelled ``--laoin_shard_idx``.
    The correctly spelled ``--laion_shard_idx`` is now the primary name and
    the old spelling is kept as an alias, so existing launch commands keep
    working. Both spellings store into ``args.laoin_shard_idx`` so code that
    reads the namespace is unaffected.

    Returns:
        argparse.Namespace holding ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects; ``num_proc``, ``shard_idx`` and ``laoin_shard_idx``
        as ints; and ``num_examples_per_shard``, ``min_num_shards`` and
        ``save_shard_idx`` as ints or ``None`` when omitted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--shard_dir_path", type=Path, required=True)
    parser.add_argument("--saving_dir", type=Path, required=True)
    parser.add_argument("--num_examples_per_shard", type=int)
    parser.add_argument("--num_proc", type=int, required=True)
    parser.add_argument("--shard_idx", type=int, required=True)
    # Accept both spellings; `dest` pins the attribute name that callers
    # already read, so only the command-line surface gains the fixed name.
    parser.add_argument(
        "--laion_shard_idx",
        "--laoin_shard_idx",
        dest="laoin_shard_idx",
        type=int,
        required=True,
    )
    parser.add_argument("--min_num_shards", type=int)
    parser.add_argument("--save_shard_idx", type=int, default=None)
    return parser.parse_args()


def main(args):
    """Run ``export_dataset_shard_idx_to_tar`` on the selected raw shard directory.

    Only entries directly under ``args.shard_dir_path`` whose name equals
    ``str(args.laoin_shard_idx)`` are passed as dataset paths. The dataset
    type is fixed to ``DatasetTypes.IMAGE_CAPTION_PAIRS`` and the remaining
    options are forwarded unchanged from the CLI.

    Args:
        args: Parsed CLI namespace, as returned by ``get_args``.
    """
    wanted_name = str(args.laoin_shard_idx)
    # The list is empty when no entry carries the requested name.
    matching_paths = [
        entry for entry in args.shard_dir_path.iterdir() if entry.name == wanted_name
    ]

    export_kwargs = {
        "hf_datasets_paths": matching_paths,
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.IMAGE_CAPTION_PAIRS,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "shard_idx": args.shard_idx,
        "min_num_shards": args.min_num_shards,
        "save_shard_idx": args.save_shard_idx,
    }
    export_dataset_shard_idx_to_tar(**export_kwargs)


if __name__ == "__main__":
    # Script entry point: parse the CLI flags, then run the export.
    main(get_args())
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import argparse
import logging
from pathlib import Path

from datasets import set_caching_enabled

from m4.training.types import DatasetTypes
from m4.utils.datasets.create_webdataset_tar import export_dataset_all_shard_idx_to_tar


# Import-time setup: configure the root logger (INFO level, timestamped
# "time - level - logger name - message" records).
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` library caching for this process.
# NOTE(review): `set_caching_enabled` looks deprecated in newer `datasets`
# releases in favor of `disable_caching()` — confirm against the pinned version.
set_caching_enabled(False)


def get_args():
    """Parse the command-line flags of this export script.

    Returns:
        argparse.Namespace holding ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects; ``num_proc`` and ``raw_cm4_shard_idx`` as ints;
        ``s3_uri`` as a str; and ``num_examples_per_shard``,
        ``min_num_shards`` (ints) and ``save_shard_prefix`` (str) as ``None``
        when omitted.
    """
    cli = argparse.ArgumentParser()

    # (flag, add_argument keyword options), in --help order.
    flag_specs = (
        ("--shard_dir_path", {"type": Path, "required": True}),
        ("--saving_dir", {"type": Path, "required": True}),
        ("--num_examples_per_shard", {"type": int}),
        ("--num_proc", {"type": int, "required": True}),
        ("--raw_cm4_shard_idx", {"type": int, "required": True}),
        ("--min_num_shards", {"type": int}),
        ("--save_shard_prefix", {"type": str, "default": None}),
        ("--s3_uri", {"type": str, "required": True}),
    )
    for flag, options in flag_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()


def main(args):
    """Run ``export_dataset_all_shard_idx_to_tar`` on the selected raw shard directory.

    Only entries directly under ``args.shard_dir_path`` whose name equals
    ``str(args.raw_cm4_shard_idx)`` are passed as dataset paths. The dataset
    type is fixed to ``DatasetTypes.WEB_DOCUMENTS`` and the remaining options
    (including ``save_shard_prefix`` and ``s3_uri``) are forwarded unchanged
    from the CLI.

    Args:
        args: Parsed CLI namespace, as returned by ``get_args``.
    """
    wanted_name = str(args.raw_cm4_shard_idx)
    # The list is empty when no entry carries the requested name.
    matching_paths = [
        entry for entry in args.shard_dir_path.iterdir() if entry.name == wanted_name
    ]

    export_kwargs = {
        "hf_datasets_paths": matching_paths,
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.WEB_DOCUMENTS,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "min_num_shards": args.min_num_shards,
        "save_shard_prefix": args.save_shard_prefix,
        "s3_uri": args.s3_uri,
    }
    export_dataset_all_shard_idx_to_tar(**export_kwargs)


if __name__ == "__main__":
    # Script entry point: parse the CLI flags, then run the export.
    main(get_args())
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import argparse
import logging
from pathlib import Path

from datasets import set_caching_enabled

from m4.training.types import DatasetTypes
from m4.utils.datasets.create_webdataset_tar import export_dataset_all_shard_idx_to_tar


# Import-time setup: configure the root logger (INFO level, timestamped
# "time - level - logger name - message" records).
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` library caching for this process.
# NOTE(review): `set_caching_enabled` looks deprecated in newer `datasets`
# releases in favor of `disable_caching()` — confirm against the pinned version.
set_caching_enabled(False)


def get_args():
    """Parse the command-line flags of this export script.

    Returns:
        argparse.Namespace holding ``shard_dir_path`` and ``saving_dir`` as
        ``Path`` objects; ``num_proc`` and ``raw_laion_shard_idx`` as ints;
        ``s3_uri`` as a str; and ``num_examples_per_shard``,
        ``min_num_shards`` (ints) and ``save_shard_prefix`` (str) as ``None``
        when omitted.
    """
    cli = argparse.ArgumentParser()

    # (flag, add_argument keyword options), in --help order.
    flag_specs = (
        ("--shard_dir_path", {"type": Path, "required": True}),
        ("--saving_dir", {"type": Path, "required": True}),
        ("--num_examples_per_shard", {"type": int}),
        ("--num_proc", {"type": int, "required": True}),
        ("--raw_laion_shard_idx", {"type": int, "required": True}),
        ("--min_num_shards", {"type": int}),
        ("--save_shard_prefix", {"type": str, "default": None}),
        ("--s3_uri", {"type": str, "required": True}),
    )
    for flag, options in flag_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()


def main(args):
    """Run ``export_dataset_all_shard_idx_to_tar`` on the selected raw shard directory.

    Only entries directly under ``args.shard_dir_path`` whose name equals
    ``str(args.raw_laion_shard_idx)`` are passed as dataset paths. The dataset
    type is fixed to ``DatasetTypes.IMAGE_CAPTION_PAIRS`` and the remaining
    options (including ``save_shard_prefix`` and ``s3_uri``) are forwarded
    unchanged from the CLI.

    Args:
        args: Parsed CLI namespace, as returned by ``get_args``.
    """
    wanted_name = str(args.raw_laion_shard_idx)
    # The list is empty when no entry carries the requested name.
    matching_paths = [
        entry for entry in args.shard_dir_path.iterdir() if entry.name == wanted_name
    ]

    export_kwargs = {
        "hf_datasets_paths": matching_paths,
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.IMAGE_CAPTION_PAIRS,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "min_num_shards": args.min_num_shards,
        "save_shard_prefix": args.save_shard_prefix,
        "s3_uri": args.s3_uri,
    }
    export_dataset_all_shard_idx_to_tar(**export_kwargs)


if __name__ == "__main__":
    # Script entry point: parse the CLI flags, then run the export.
    main(get_args())
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import argparse
import logging
from pathlib import Path

from datasets import set_caching_enabled

from m4.training.types import DatasetTypes
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar


# Import-time setup: configure the root logger (INFO level, timestamped
# "time - level - logger name - message" records).
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` library caching for this process.
# NOTE(review): `set_caching_enabled` looks deprecated in newer `datasets`
# releases in favor of `disable_caching()` — confirm against the pinned version.
set_caching_enabled(False)


def get_args():
    """Parse the command-line flags of this export script.

    Returns:
        argparse.Namespace holding ``saving_dir`` as a ``Path``,
        ``num_examples_per_shard``, ``num_proc`` and ``shard_idx`` as ints,
        and ``min_num_shards`` as an int or ``None`` when the flag is omitted.
    """
    cli = argparse.ArgumentParser()

    # Mandatory flags, registered in the order they should appear in --help.
    required_flags = (
        ("--saving_dir", Path),
        ("--num_examples_per_shard", int),
        ("--num_proc", int),
        ("--shard_idx", int),
    )
    for flag, flag_type in required_flags:
        cli.add_argument(flag, type=flag_type, required=True)

    # Optional flag: stays None unless given on the command line.
    cli.add_argument("--min_num_shards", type=int)

    return cli.parse_args()


def main(args):
    """Run ``export_dataset_shard_idx_to_tar`` on the LrvInstruction train split.

    The dataset path is hard-coded to ``"VictorSanh/LrvInstruction:train"``
    and the dataset type to ``DatasetTypes.LRV_PAIRS``; the remaining options
    are forwarded unchanged from the CLI.

    Args:
        args: Parsed CLI namespace, as returned by ``get_args``.
    """
    source_datasets = ["VictorSanh/LrvInstruction:train"]

    export_kwargs = {
        "hf_datasets_paths": source_datasets,
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.LRV_PAIRS,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "shard_idx": args.shard_idx,
        "min_num_shards": args.min_num_shards,
    }
    export_dataset_shard_idx_to_tar(**export_kwargs)


if __name__ == "__main__":
    # Script entry point: parse the CLI flags, then run the export.
    main(get_args())
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import argparse
import logging
from pathlib import Path

from datasets import set_caching_enabled

from m4.training.types import DatasetTypes
from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar


# Import-time setup: configure the root logger (INFO level, timestamped
# "time - level - logger name - message" records).
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
)
# Module-level logger, explicitly pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Switch off the `datasets` library caching for this process.
# NOTE(review): `set_caching_enabled` looks deprecated in newer `datasets`
# releases in favor of `disable_caching()` — confirm against the pinned version.
set_caching_enabled(False)


def get_args():
    """Parse the command-line flags of this export script.

    Returns:
        argparse.Namespace holding ``saving_dir`` as a ``Path``,
        ``num_examples_per_shard``, ``num_proc`` and ``shard_idx`` as ints,
        and ``min_num_shards`` as an int or ``None`` when the flag is omitted.
    """
    cli = argparse.ArgumentParser()

    # Mandatory flags, registered in the order they should appear in --help.
    required_flags = (
        ("--saving_dir", Path),
        ("--num_examples_per_shard", int),
        ("--num_proc", int),
        ("--shard_idx", int),
    )
    for flag, flag_type in required_flags:
        cli.add_argument(flag, type=flag_type, required=True)

    # Optional flag: stays None unless given on the command line.
    cli.add_argument("--min_num_shards", type=int)

    return cli.parse_args()


def main(args):
    """Run ``export_dataset_shard_idx_to_tar`` on the LLaVA-Instruct-150K train split.

    The dataset path is hard-coded to
    ``"HuggingFaceM4/LLaVA-Instruct-150K:train"`` and the dataset type to
    ``DatasetTypes.LLaVA``; the remaining options are forwarded unchanged
    from the CLI.

    Args:
        args: Parsed CLI namespace, as returned by ``get_args``.
    """
    source_datasets = ["HuggingFaceM4/LLaVA-Instruct-150K:train"]

    export_kwargs = {
        "hf_datasets_paths": source_datasets,
        "saving_dir": args.saving_dir,
        "ds_type": DatasetTypes.LLaVA,
        "num_examples_per_shard": args.num_examples_per_shard,
        "num_proc": args.num_proc,
        "shard_idx": args.shard_idx,
        "min_num_shards": args.min_num_shards,
    }
    export_dataset_shard_idx_to_tar(**export_kwargs)


if __name__ == "__main__":
args = get_args()
main(args)
Loading

0 comments on commit ab607d3

Please sign in to comment.