diff --git a/clarifai/datasets/upload/examples/image_classification/cifar10/dataset.py b/clarifai/datasets/upload/examples/image_classification/cifar10/dataset.py index 4d308d68..457a6a4f 100644 --- a/clarifai/datasets/upload/examples/image_classification/cifar10/dataset.py +++ b/clarifai/datasets/upload/examples/image_classification/cifar10/dataset.py @@ -36,7 +36,11 @@ def __getitem__(self, index): return VisualClassificationFeatures( image_path=os.path.join(os.path.dirname(__file__), item[0]), label=item[1], - id=os.path.basename(item[0]).split(".")[0]) + id=os.path.basename(item[0]).split(".")[0], + metadata={ + "split": self.split, + "image_path": item[0] + }) def __len__(self): return len(self.data) diff --git a/clarifai/datasets/upload/examples/text_classification/imdb_dataset/dataset.py b/clarifai/datasets/upload/examples/text_classification/imdb_dataset/dataset.py index cad69d31..bfaae2a6 100644 --- a/clarifai/datasets/upload/examples/text_classification/imdb_dataset/dataset.py +++ b/clarifai/datasets/upload/examples/text_classification/imdb_dataset/dataset.py @@ -27,7 +27,12 @@ def load_data(self): reader = csv.reader(_file) next(reader, None) # skip header for review in reader: - self.data.append({"text": review[0], "labels": review[1], "id": None}) + self.data.append({ + "text": review[0], + "labels": review[1], + "id": None, + "metadata": dict(split=self.split) + }) def __getitem__(self, idx): item = self.data[idx] diff --git a/clarifai/datasets/upload/features.py b/clarifai/datasets/upload/features.py index 77adf389..0e93f695 100644 --- a/clarifai/datasets/upload/features.py +++ b/clarifai/datasets/upload/features.py @@ -9,6 +9,7 @@ class TextFeatures: text: str labels: List[Union[str, int]] # List[str or int] to cater for multi-class tasks id: Optional[int] = None # text_id + metadata: Optional[dict] = None @dataclass @@ -18,6 +19,7 @@ class VisualClassificationFeatures: label: Union[str, int] geo_info: Optional[List[float]] = None #[Longitude, Latitude] id: Optional[int] = None # image_id + metadata: Optional[dict] = None @dataclass @@ -28,6 +30,7 @@ class VisualDetectionFeatures: bboxes: List[List[float]] geo_info: Optional[List[float]] = None #[Longitude, Latitude] id: Optional[int] = None # image_id + metadata: Optional[dict] = None @dataclass @@ -38,3 +41,4 @@ class VisualSegmentationFeatures: polygons: List[List[List[float]]] geo_info: Optional[List[float]] = None #[Longitude, Latitude] id: Optional[int] = None # image_id + metadata: Optional[dict] = None diff --git a/clarifai/datasets/upload/image.py b/clarifai/datasets/upload/image.py index b1ef7508..8402631b 100644 --- a/clarifai/datasets/upload/image.py +++ b/clarifai/datasets/upload/image.py @@ -32,7 +32,10 @@ def process_datagen_item(id): list) else [datagen_item.label] # clarifai concept input_id = f"{self.dataset_id}-{self.split}-{id}" if datagen_item.id is None else f"{self.dataset_id}-{self.split}-{str(datagen_item.id)}" geo_info = datagen_item.geo_info - metadata.update({"filename": os.path.basename(image_path), "split": self.split}) + if datagen_item.metadata is not None: + metadata.update(datagen_item.metadata) + else: + metadata.update({"filename": os.path.basename(image_path), "split": self.split}) self.all_input_ids[id] = input_id input_protos.append( @@ -76,7 +79,10 @@ def process_datagen_item(id): labels = datagen_item.classes # list:[l1,...,ln] bboxes = datagen_item.bboxes # [[xmin,ymin,xmax,ymax],...,[xmin,ymin,xmax,ymax]] input_id = f"{self.dataset_id}-{self.split}-{id}" if datagen_item.id is None else f"{self.dataset_id}-{self.split}-{str(datagen_item.id)}" - metadata.update({"filename": os.path.basename(image), "split": self.split}) + if datagen_item.metadata is not None: + metadata.update(datagen_item.metadata) + else: + metadata.update({"filename": os.path.basename(image), "split": self.split}) geo_info = datagen_item.geo_info self.all_input_ids[id] = input_id @@ -126,7 +132,10 @@ def process_datagen_item(id): labels = datagen_item.classes _polygons = datagen_item.polygons # list of polygons: [[[x,y],...,[x,y]],...] input_id = f"{self.dataset_id}-{self.split}-{id}" if datagen_item.id is None else f"{self.dataset_id}-{self.split}-{str(datagen_item.id)}" - metadata.update({"filename": os.path.basename(image), "split": self.split}) + if datagen_item.metadata is not None: + metadata.update(datagen_item.metadata) + else: + metadata.update({"filename": os.path.basename(image), "split": self.split}) geo_info = datagen_item.geo_info self.all_input_ids[id] = input_id diff --git a/clarifai/datasets/upload/text.py b/clarifai/datasets/upload/text.py index ff6073e8..96e8bf30 100644 --- a/clarifai/datasets/upload/text.py +++ b/clarifai/datasets/upload/text.py @@ -31,7 +31,10 @@ def process_datagen_item(id): labels = datagen_item.labels if isinstance( datagen_item.labels, list) else [datagen_item.labels] # clarifai concept input_id = f"{self.dataset_id}-{self.split}-{id}" if datagen_item.id is None else f"{self.dataset_id}-{self.split}-{str(datagen_item.id)}" - metadata.update({"split": self.split}) + if datagen_item.metadata is not None: + metadata.update(datagen_item.metadata) + else: + metadata.update({"split": self.split}) self.all_input_ids[id] = input_id input_protos.append(