diff --git a/datasets/1.0/huggingface-prism-alignment/metadata.json b/datasets/1.0/huggingface-prism-alignment/metadata.json new file mode 100644 index 00000000..220cea48 --- /dev/null +++ b/datasets/1.0/huggingface-prism-alignment/metadata.json @@ -0,0 +1,711 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" + }, + "@type": "sc:Dataset", + "name": "prism-alignment", + "description": "The PRISM Alignment dataset pairs conversations between human participants and large language models with the participants' contextual preference scores, fine-grained performance and choice attribute ratings, and written feedback.\n", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "citeAs": "@article{kirk2024prism,\n title={The PRISM Alignment Project: What Participatory, Representative and Individualised Human Feedback Reveals About the Pluralistic Alignment of Large Language Models},\n author={Kirk, Hannah Rose and others},\n journal={arXiv preprint arXiv:2404.16019},\n year={2024}\n}\n",
+ "license": "cc-by-nc-4.0", + "url": "https://huggingface.co/datasets/HannahRoseKirk/prism-alignment", + "version": "1.0.0", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "repo", + "name": "repo", + "description": "The Hugging Face git repository.", + "contentUrl": "https://huggingface.co/datasets/HannahRoseKirk/prism-alignment/tree/refs%2Fconvert%2Fparquet", + "encodingFormat": "git+https", + "sha256": "https://github.com/mlcommons/croissant/issues/80" + }, + { + "@type": "cr:FileSet", + "@id": "conversations-parquet-files", + "name": "conversations-parquet-files", + "description": "The underlying Parquet files as converted by Hugging Face for the conversations record set (see: https://huggingface.co/docs/datasets-server/parquet).", + "containedIn": { + "@id": "repo" + }, + "encodingFormat": "application/x-parquet", + "includes": "conversations/*/*.parquet" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "conversations_splits", + "name": "conversations_splits", + "description": "Splits for the conversations config.", + "dataType": "cr:Split", + "key": { + "@id": "conversations_splits/split_name" + }, + "field": [ + { + "@type": "cr:Field", + "@id": "conversations_splits/split_name", + "name": "conversations_splits/split_name", + "description": "The name of the split.", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "conversations_splits/split_url", + "name": "conversations_splits/split_url", + "description": "The url of the split.", + "dataType": "sc:Text" + } + ], + "data": { + "conversations_splits/split_name": "train", + "conversations_splits/split_url": "cr:TrainingSplit" + } + }, + { + "@type": "cr:RecordSet", + "@id": "conversations", + "name": "conversations", + "description": "The conversations record set.", + "field": [ + { + "@type": "cr:Field", + "@id": "conversations/split", + "name": "conversations/split", + "description": "Split to which the example belongs.", + "dataType": "sc:Text", + "references": { + "field": { + "@id": "conversations_splits/split_name" + } + }, + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "fileProperty": "fullpath" + }, + "transform": { + "regex": "conversations/(.*)/.+parquet$" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_id", + "name": "conversations/conversation_id", + "description": "Column conversation_id from conversations parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_id" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/user_id", + "name": "conversations/user_id", + "description": "Column user_id from conversations parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "user_id" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_type", + "name": "conversations/conversation_type", + "description": "Column conversation_type from conversations parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_type" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/opening_prompt", + "name": "conversations/opening_prompt", + "description": "Column opening_prompt from conversations parquet file.", + "dataType": "sc:Text", + "source": { + 
"fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "opening_prompt" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_turns", + "name": "conversations/conversation_turns", + "description": "Column conversation_turns from Hugging Face parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_turns" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_history", + "name": "conversations/conversation_history", + "description": "Column conversation_history from conversations parquet file.", + "repeated": true, + "subField": [ + { + "@type": "cr:Field", + "@id": "conversations/conversation_history/turn", + "name": "conversations/conversation_history/turn", + "description": "Column conversation_history/turn from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_history" + }, + "transform": { + "jsonPath": "$..turn" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_history/role", + "name": "conversations/conversation_history/role", + "description": "Column conversation_history/role from conversations parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_history" + }, + "transform": { + "jsonPath": "$..role" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_history/content", + "name": "conversations/conversation_history/content", + "description": "Column conversation_history/content from conversations parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_history" + }, + "transform": { + "jsonPath": "$..content" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_history/model_provider", + "name": "conversations/conversation_history/model_provider", + "description": "Column conversation_history/model_provider from conversations parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_history" + }, + "transform": { + "jsonPath": "$..model_provider" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_history/model_name", + "name": "conversations/conversation_history/model_name", + "description": "Column conversation_history/model_name from conversations parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_history" + }, + "transform": { + "jsonPath": "$..model_name" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_history/score", + "name": "conversations/conversation_history/score", + "description": "Column conversation_history/score from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_history" + }, + "transform": { + "jsonPath": "$..score" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_history/if_chosen", + "name": "conversations/conversation_history/if_chosen", + "description": "Column conversation_history/if_chosen from 
conversations parquet file.", + "dataType": "sc:Boolean", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_history" + }, + "transform": { + "jsonPath": "$..if_chosen" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/conversation_history/within_turn_id", + "name": "conversations/conversation_history/within_turn_id", + "description": "Column conversation_history/within_turn_id from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "conversation_history" + }, + "transform": { + "jsonPath": "$..within_turn_id" + } + } + } + ] + }, + { + "@type": "cr:Field", + "@id": "conversations/performance_attributes", + "name": "conversations/performance_attributes", + "description": "Column performance_attributes from conversations parquet file.", + "subField": [ + { + "@type": "cr:Field", + "@id": "conversations/performance_attributes/values", + "name": "conversations/performance_attributes/values", + "description": "Column performance_attributes/values from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "performance_attributes" + }, + "transform": { + "jsonPath": "$..values" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/performance_attributes/fluency", + "name": "conversations/performance_attributes/fluency", + "description": "Column performance_attributes/fluency from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "performance_attributes" + }, + "transform": { + "jsonPath": "$..fluency" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/performance_attributes/factuality", + "name": "conversations/performance_attributes/factuality", + "description": "Column performance_attributes/factuality from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "performance_attributes" + }, + "transform": { + "jsonPath": "$..factuality" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/performance_attributes/safety", + "name": "conversations/performance_attributes/safety", + "description": "Column performance_attributes/safety from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "performance_attributes" + }, + "transform": { + "jsonPath": "$..safety" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/performance_attributes/diversity", + "name": "conversations/performance_attributes/diversity", + "description": "Column performance_attributes/diversity from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "performance_attributes" + }, + "transform": { + "jsonPath": "$..diversity" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/performance_attributes/creativity", + "name": "conversations/performance_attributes/creativity", + "description": "Column performance_attributes/creativity from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": 
{ + "column": "performance_attributes" + }, + "transform": { + "jsonPath": "$..creativity" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/performance_attributes/helpfulness", + "name": "conversations/performance_attributes/helpfulness", + "description": "Column performance_attributes/helpfulness from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "performance_attributes" + }, + "transform": { + "jsonPath": "$..helpfulness" + } + } + } + ] + }, + { + "@type": "cr:Field", + "@id": "conversations/choice_attributes", + "name": "conversations/choice_attributes", + "description": "Column choice_attributes from conversations parquet file.", + "subField": [ + { + "@type": "cr:Field", + "@id": "conversations/choice_attributes/values", + "name": "conversations/choice_attributes/values", + "description": "Column choice_attributes/values from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "choice_attributes" + }, + "transform": { + "jsonPath": "$..values" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/choice_attributes/fluency", + "name": "conversations/choice_attributes/fluency", + "description": "Column choice_attributes/fluency from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "choice_attributes" + }, + "transform": { + "jsonPath": "$..fluency" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/choice_attributes/factuality", + "name": "conversations/choice_attributes/factuality", + "description": "Column choice_attributes/factuality from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "choice_attributes" + }, + "transform": { + "jsonPath": "$..factuality" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/choice_attributes/safety", + "name": "conversations/choice_attributes/safety", + "description": "Column choice_attributes/safety from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "choice_attributes" + }, + "transform": { + "jsonPath": "$..safety" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/choice_attributes/diversity", + "name": "conversations/choice_attributes/diversity", + "description": "Column choice_attributes/diversity from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "choice_attributes" + }, + "transform": { + "jsonPath": "$..diversity" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/choice_attributes/creativity", + "name": "conversations/choice_attributes/creativity", + "description": "Column choice_attributes/creativity from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "choice_attributes" + }, + "transform": { + "jsonPath": "$..creativity" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/choice_attributes/helpfulness", + "name": "conversations/choice_attributes/helpfulness", + "description": "Column 
choice_attributes/helpfulness from conversations parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "choice_attributes" + }, + "transform": { + "jsonPath": "$..helpfulness" + } + } + } + ] + }, + { + "@type": "cr:Field", + "@id": "conversations/open_feedback", + "name": "conversations/open_feedback", + "description": "Column open_feedback from conversations parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "open_feedback" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/generated_datetime", + "name": "conversations/generated_datetime", + "description": "Column generated_datetime from conversations parquet file.", + "dataType": "sc:Date", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "generated_datetime" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/timing_duration_s", + "name": "conversations/timing_duration_s", + "description": "Column timing_duration from conversations parquet file.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "timing_duration_s" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/timing_duration_mins", + "name": "conversations/timing_duration_mins", + "description": "Column timing_duration_mins from conversations parquet file.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "timing_duration_mins" + } + } + }, + { + "@type": "cr:Field", + "@id": "conversations/included_in_balanced_subset", + "name": "conversations/included_in_balanced_subset", + "description": "Column included_in_balanced_subset from conversations parquet file.", + "dataType": "sc:Boolean", + "source": { + "fileSet": { + "@id": "conversations-parquet-files" + }, + "extract": { + "column": "included_in_balanced_subset" + } + } + } + ] + } + ] +} diff --git a/datasets/1.0/huggingface-prism-alignment/output/conversations.jsonl b/datasets/1.0/huggingface-prism-alignment/output/conversations.jsonl new file mode 100644 index 00000000..812381f3 --- /dev/null +++ b/datasets/1.0/huggingface-prism-alignment/output/conversations.jsonl @@ -0,0 +1 @@ +{"conversations/split": "train", "conversations/conversation_id": "c0", "conversations/user_id": "user0", "conversations/conversation_type": "unguided", "conversations/opening_prompt": "What can you do about the inequality of wealth?", "conversations/conversation_turns": 5, "conversations/conversation_history": "[{'conversations/conversation_history/turn': 0, 'conversations/conversation_history/role': b'user', 'conversations/conversation_history/content': b'What can you do about the inequality of wealth?', 'conversations/conversation_history/model_provider': None, 'conversations/conversation_history/model_name': None, 'conversations/conversation_history/score': None, 'conversations/conversation_history/if_chosen': None, 'conversations/conversation_history/within_turn_id': None}, {'conversations/conversation_history/turn': 0, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b'Make it illegal for corporations to own a majority of the wealth', 'conversations/conversation_history/model_provider': b'huggingface_api', 'conversations/conversation_history/model_name': b'google/flan-t5-xxl', 
'conversations/conversation_history/score': 17, 'conversations/conversation_history/if_chosen': False, 'conversations/conversation_history/within_turn_id': 0}, {'conversations/conversation_history/turn': 0, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b'As an AI language model, I cannot express personal opinions or engage in discussions related to wealth inequality or financial matters. \\n\\nMy purpose is to provide fact-based responses to your queries to the best of my knowledge and capabilities based on the training data and algorithms I have been programmed with. If you have any specific queries or need assistance with something else, please let me know, and I will do my best to assist you. \\n\\nWould you like me to rephrase my previous response or provide additional details based on your preferences?', 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 92, 'conversations/conversation_history/if_chosen': True, 'conversations/conversation_history/within_turn_id': 1}, {'conversations/conversation_history/turn': 1, 'conversations/conversation_history/role': b'user', 'conversations/conversation_history/content': b'When can you do something?', 'conversations/conversation_history/model_provider': None, 'conversations/conversation_history/model_name': None, 'conversations/conversation_history/score': None, 'conversations/conversation_history/if_chosen': None, 'conversations/conversation_history/within_turn_id': None}, {'conversations/conversation_history/turn': 1, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b'As an AI language model, I do not have any preferences or scheduled timing for performing specific tasks. My training data and algorithms drive my functionality to provide responses to your queries. \\n\\nI am trained to respond to your requests and provide relevant information to the best of my knowledge. I do not have the ability to prioritize requests or perform tasks consciously. \\n\\nIf you have any specific queries or requirements, feel free to ask me, and I will do my best to assist and provide useful information to the best of my capabilities. \\n\\nWould you like me to clarify something further or provide additional details about my capabilities?', 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 75, 'conversations/conversation_history/if_chosen': False, 'conversations/conversation_history/within_turn_id': 0}, {'conversations/conversation_history/turn': 1, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b'I apologize for any confusion caused by my previous response. \\n\\nAs an AI language model, I do not have the ability to take actions or implement solutions related to wealth inequality. My purpose is to provide information and support discussions to the best of my knowledge and capabilities based on the training data and algorithms I have been programmed with. \\n\\nIn practical terms, the resources and capabilities currently available to develop large language models like myself do not extend to addressing economic or social issues such as wealth inequality. 
Future advancements in natural language processing and machine learning may lead to further improvements in the assistance and advice I can provide, but currently, there are no practical mechanisms in place to address wealth inequality. \\n\\nIf you have any other questions or need assistance with something else, please feel free to ask, and I will do my best to provide relevant and helpful information based on my capabilities.', 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 77, 'conversations/conversation_history/if_chosen': True, 'conversations/conversation_history/within_turn_id': 1}, {'conversations/conversation_history/turn': 2, 'conversations/conversation_history/role': b'user', 'conversations/conversation_history/content': b'Are you ever going to evolve yourself?', 'conversations/conversation_history/model_provider': None, 'conversations/conversation_history/model_name': None, 'conversations/conversation_history/score': None, 'conversations/conversation_history/if_chosen': None, 'conversations/conversation_history/within_turn_id': None}, {'conversations/conversation_history/turn': 2, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b\"The ability of an AI model to evolve or improve over time depends on the training data, algorithms, and updates implemented by the developers. Machine learning models, including large language models like myself, can learn and improve from data through a process called training and reinforcement.\\n\\nMy training data and the algorithms used to power me have been continuously evolving and improving since they were developed. These advancements allow me to provide more accurate and responsive answers to your queries. \\n\\nIn general, AI models are subject to periodic updates and improvements based on the feedback and learning from their usage. These updates help refine the model's performance and ensure it remains effective in providing useful information. \\n\\nWhile it is not possible for an AI model to evolve or improve spontaneously, controlled updates and advancements based on user feedback and learning opportunities enable the model to refine its abilities over time. \\n\\nWould you like me to provide more details on the process of updates or expansions for my capabilities?\", 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 70, 'conversations/conversation_history/if_chosen': True, 'conversations/conversation_history/within_turn_id': 0}, {'conversations/conversation_history/turn': 2, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b\"As an artificial intelligence language model, I do not have the ability to evolve or learn in the same way as biological organisms. My development and capabilities are constantly evolving and being enhanced through various algorithms and improvements in machine learning techniques.\\n\\nMy training data, algorithms, and updates help expand my knowledge and capabilities over time. These improvements enable me to provide more accurate and informative responses to users' queries.\\n\\nWhile I have limitations as an AI model, my developers and programmers are continuously working to improve my algorithms and training processes to make me more useful and responsive. 
\\n\\nAdditionally, advancements in deep learning, computer vision, and other fields of AI and machine learning provide the potential for expanding my knowledge and capabilities in the future. \\n\\nIt is important to note that the field of AI is rapidly evolving, and improvements and developments in algorithms and infrastructure will continue to influence the capabilities and functionalities of language models like me. \\n\\nIf you have any further questions or concerns or would like to discuss this topic further, feel free to ask.\", 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 65, 'conversations/conversation_history/if_chosen': False, 'conversations/conversation_history/within_turn_id': 1}, {'conversations/conversation_history/turn': 3, 'conversations/conversation_history/role': b'user', 'conversations/conversation_history/content': b\"Why are all the good AI's behind paywalls?\", 'conversations/conversation_history/model_provider': None, 'conversations/conversation_history/model_name': None, 'conversations/conversation_history/score': None, 'conversations/conversation_history/if_chosen': None, 'conversations/conversation_history/within_turn_id': None}, {'conversations/conversation_history/turn': 3, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b'The availability and accessibility of AI models, including large language models, are influenced by a combination of factors, including development costs, pricing structures, and business strategies. While there are efforts to make AI technologies accessible, some models may be behind a paywall or subject to usage restrictions for various reasons: \\n\\n1. Cost of Development and Maintenance: Developing and training large language models, as well as maintaining and updating them, can be resource-intensive and costly. Businesses and researchers may charge for access or usage fees to cover these expenses and maintain the quality of their models.\\n\\n2. Commercialization and Business Strategies: Many AI companies and research organizations are businesses, and their pricing structures are determined by their business strategies and the value they attribute to providing access to their models. They may choose to sell access to users who this is more cost-effective than internally distributing the benefits through their revenue. \\n\\n3. Limited Availability of Resources: Not all organizations or individuals may have the necessary resources or capacity to develop and maintain their own state-of-the-art AI models. In such cases, paying for access to pre-built and rigorously tested models can provide a viable alternative. \\n\\nIt is important to note that there are initiatives and efforts ongoing to make AI technologies more accessible. Open-source libraries, community resources, and scholarships can help users gain access to useful AI tools and technologies. \\n\\nIf you have specific interests or requirements related to using AI models and are facing issues with access, please feel free to explore different resources and reach out to the developers of the relevant models or organizations to inquire about their accessibility policies and potential alternatives. 
\\n\\nWould you like me to provide you with recommendations or resources that can assist you in accessing or using AI technologies?', 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 73, 'conversations/conversation_history/if_chosen': True, 'conversations/conversation_history/within_turn_id': 0}, {'conversations/conversation_history/turn': 3, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b'The availability and accessibility of AI models, including those with advanced capabilities, can be influenced by various factors such as development costs, business strategies, and open-source initiatives. While there are indeed reputable companies and individuals developing AI technologies who charge for access or use, there are also other approaches and initiatives in progress.\\n\\nOne approach is the use of research universities and academic institutions. Many talented researchers work on developing AI technologies supported by academic institutions, and their research often leads to innovative solutions and advancements. These institutions often make their findings available through publications, patents, or open-source projects to encourage collaboration and knowledge sharing.\\n\\nSome companies choose to develop and maintain open-source projects that provide access to their AI technologies or platforms. By making their technologies available through open-source licenses, these companies encourage collaboration and allow other organizations to adapt and build upon their developments.\\n\\nAdditionally, there are initiatives to make AI more accessible for research and development. Some companies and organizations provide access to their datasets, models, or development platforms to encourage innovation and research in the field. They understand the importance of collaboration and knowledge sharing to advance the state of AI.\\n\\nIt is worth noting that the AI industry is evolving rapidly, and there are ongoing developments and discussions to make AI technologies more accessible and equitable. There are also initiatives and organizations dedicated to promoting transparency, fairness, and accountability in the development and deployment of AI systems.\\n\\nAs the field progresses, there may be more widespread availability and acceptance of advanced AI solutions due to these efforts and changes in business models and practices. 
\\n\\nWould you like me to provide specific examples or addresses any concerns you have related to accessibility of advanced AI solutions?', 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 67, 'conversations/conversation_history/if_chosen': False, 'conversations/conversation_history/within_turn_id': 1}, {'conversations/conversation_history/turn': 4, 'conversations/conversation_history/role': b'user', 'conversations/conversation_history/content': b'Everyone likes free stuff, give us free options', 'conversations/conversation_history/model_provider': None, 'conversations/conversation_history/model_name': None, 'conversations/conversation_history/score': None, 'conversations/conversation_history/if_chosen': None, 'conversations/conversation_history/within_turn_id': None}, {'conversations/conversation_history/turn': 4, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b'Unfortunately, I cannot fulfill your request for free options in the context of using advanced AI models. The development and maintenance of high-quality AI models require significant resources and investment, which makes them costly to access. \\n\\nHowever, there are resources available to help you explore and interact with AI technologies without the need for paying for access. Here are some suggestions:\\n\\n1. Open-Source Libraries: Many AI libraries and frameworks, such as TensorFlow, PyTorch, and Keras, provide free and open-source versions of their platforms. These options allow you to get hands-on experience and experiment with AI capabilities without any cost.\\n\\n2. Online Courses and Tutorials: There are many online platforms and communities that offer tutorials, workshops, and educational resources on AI and machine learning. These resources can help you understand the basics and concepts without incurring any monetary expenditure.\\n\\n3. Community Initiatives and Grants: There are periodic initiatives and grants provided by organizations and communities dedicated to making AI technologies more accessible. These can provide opportunities for individuals or organizations to use AI tools and perform projects with limited cost.\\n\\nPlease note that the availability and quality of free options may vary, and the capabilities of free resources compared to paid ones may have certain limitations or showcase the models in simpler forms. It is important to explore and select options that align with your specific interests and requirements.\\n\\nIt is worth mentioning that the best way to interact with these technologies to a full extent and gain access to advanced features and capabilities is to purchase subscriptions or follow the cost structures of the resources you select. \\n\\nWould you like me to provide specific recommendations or resources that cater to your interests or areas of exploration within AI?', 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 84, 'conversations/conversation_history/if_chosen': True, 'conversations/conversation_history/within_turn_id': 0}, {'conversations/conversation_history/turn': 4, 'conversations/conversation_history/role': b'model', 'conversations/conversation_history/content': b\"I'm sorry, but I can't provide you with options for using large language models for free. 
The development and maintenance of high-quality language models require significant computational resources and ongoing efforts. \\n\\nHowever, there are alternatives or complements to traditional large language models that can provide similar benefits but may have different pricing structures or requirements. Here are a few suggestions:\\n\\n1. Open-source Models: Certain projects and communities offer open-source versions of language models with lower restrictions. While these may have limitations in terms of usage or require more in-depth implementation and customization, they can provide alternatives to commercial models.\\n\\n2. Pretrained Models: Many language models, including GPT-3, are available in pretrained versions that have been trained on large amounts of data. These models can be accessed and used for specific tasks or tasks related to their training without the need for additional training data and computational resources.\\n\\n3. Community-Contributed Models: There are initiatives and platforms that provide access to simpler or smaller language models that can be useful for specific tasks or experiments. These models are contributed and shared by the AI community.\\n\\nPlease note that the availability and suitability of these alternatives may vary and come with their own considerations. It is important to evaluate each option based on your specific requirements and constraints. Additionally, it is recommended to review the terms of use and any potential limitations associated with these alternatives. \\n\\nIf you have specific use cases or requirements in mind, feel free to provide more detail, and I can assist you in suggesting appropriate approaches, resources, or communities that you can explore for accessing or using language models.\", 'conversations/conversation_history/model_provider': b'cohere', 'conversations/conversation_history/model_name': b'command-light', 'conversations/conversation_history/score': 77, 'conversations/conversation_history/if_chosen': False, 'conversations/conversation_history/within_turn_id': 1}]", "conversations/performance_attributes": {"conversations/performance_attributes/values": "None", "conversations/performance_attributes/fluency": 95, "conversations/performance_attributes/factuality": 86, "conversations/performance_attributes/safety": 88, "conversations/performance_attributes/diversity": 67, "conversations/performance_attributes/creativity": 28, "conversations/performance_attributes/helpfulness": 50}, "conversations/choice_attributes": {"conversations/choice_attributes/values": "None", "conversations/choice_attributes/fluency": 80, "conversations/choice_attributes/factuality": 73, "conversations/choice_attributes/safety": 75, "conversations/choice_attributes/diversity": 73, "conversations/choice_attributes/creativity": 19, "conversations/choice_attributes/helpfulness": 71}, "conversations/open_feedback": "Shorter blocks would be nice. 
but has to have enough info.", "conversations/generated_datetime": "2023-11-22 15:55:46", "conversations/timing_duration_s": 401.07, "conversations/timing_duration_mins": 6.68, "conversations/included_in_balanced_subset": false} diff --git a/datasets/1.0/pass-mini/output/images.jsonl b/datasets/1.0/pass-mini/output/images.jsonl index 83f6b197..588e0191 100644 --- a/datasets/1.0/pass-mini/output/images.jsonl +++ b/datasets/1.0/pass-mini/output/images.jsonl @@ -1,8 +1,8 @@ -{"images/creator_uname": "PaperBird+Photography%3C3", "images/latitude": null, "images/longitude": null, "images/date_taken": "2007-05-06 06:11:48", "images/hash": "75f7305b1fd94044e14bdcdde469dbb2", "images/image_content": ">"} -{"images/creator_uname": "Chiara+Marra", "images/latitude": 38.23818, "images/longitude": 13.183593, "images/date_taken": "2007-05-04 15:46:43", "images/hash": "dd571a41a015354d92a859f7ef31201", "images/image_content": ">"} -{"images/creator_uname": "maplesbranch", "images/latitude": null, "images/longitude": null, "images/date_taken": "2006-05-01 07:34:13", "images/hash": "598ad3bc7e6e876e61af116693c7ad9", "images/image_content": ">"} -{"images/creator_uname": "maplesbranch", "images/latitude": null, "images/longitude": null, "images/date_taken": "2006-04-23 19:20:40", "images/hash": "e48d6d552465c5728585b82a53d6e02c", "images/image_content": ">"} -{"images/creator_uname": "quinnums", "images/latitude": null, "images/longitude": null, "images/date_taken": "2004-05-17 00:44:29", "images/hash": "ffd3eb12a16cb83138f26e6f36dec967", "images/image_content": ">"} -{"images/creator_uname": "striatic", "images/latitude": 53.535233, "images/longitude": -113.565075, "images/date_taken": "2004-05-11 02:00:33", "images/hash": "fff0eece99cc71c2e91fe716051599", "images/image_content": ">"} -{"images/creator_uname": "striatic", "images/latitude": null, "images/longitude": null, "images/date_taken": "2004-05-27 10:34:28", "images/hash": "fedefe9f11bf2a749a749bfca8bf28", "images/image_content": ">"} -{"images/creator_uname": "quinnums", "images/latitude": null, "images/longitude": null, "images/date_taken": "2004-05-29 02:14:36", "images/hash": "ff379727f52bcec4dfb237ace41627", "images/image_content": ">"} +{"images/hash": "75f7305b1fd94044e14bdcdde469dbb2", "images/image_content": ">", "images/creator_uname": "PaperBird+Photography%3C3", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2007-05-06 06:11:48"} +{"images/hash": "dd571a41a015354d92a859f7ef31201", "images/image_content": ">", "images/creator_uname": "Chiara+Marra", "images/gps_coordinates": {"images/latitude": 38.23818, "images/longitude": 13.183593}, "images/date_taken": "2007-05-04 15:46:43"} +{"images/hash": "598ad3bc7e6e876e61af116693c7ad9", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2006-05-01 07:34:13"} +{"images/hash": "e48d6d552465c5728585b82a53d6e02c", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2006-04-23 19:20:40"} +{"images/hash": "ffd3eb12a16cb83138f26e6f36dec967", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-17 00:44:29"} +{"images/hash": "fff0eece99cc71c2e91fe716051599", "images/image_content": ">", 
"images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": 53.535233, "images/longitude": -113.565075}, "images/date_taken": "2004-05-11 02:00:33"} +{"images/hash": "fedefe9f11bf2a749a749bfca8bf28", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-27 10:34:28"} +{"images/hash": "ff379727f52bcec4dfb237ace41627", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-29 02:14:36"} diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/base_operation.py b/python/mlcroissant/mlcroissant/_src/operation_graph/base_operation.py index c8a887c9..0b1e8a25 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/base_operation.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/base_operation.py @@ -120,7 +120,7 @@ def __call__(self, set_output_in_memory: bool = False) -> OutputT: inputs = self.inputs output = self.call() if inputs is None else self.call(*inputs) if isinstance(output, types.GeneratorType) and set_output_in_memory: - output = pd.DataFrame(output) + output = pd.DataFrame(output) # type:ignore self.set_output(output) return output diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index 274d876f..e72dd652 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -32,6 +32,10 @@ def _parse_jsonpath(json_path: str): return jsonpath_rw.parse(json_path) +def _is_repeated_field(field: Field | None) -> bool | None: + return isinstance(field, Field) and field.repeated + + def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: """Applies one transform to `value`.""" if transform.regex is not None: @@ -53,17 +57,22 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: return pd.Timestamp(value).strftime(transform.format) else: raise ValueError(f"`format` only applies to dates. Got {field.data_type}") + elif transform.separator is not None: + return value.split(transform.separator) return value -def apply_transforms_fn(value: Any, field: Field) -> Any: +def apply_transforms_fn(value: Any, field: Field, repeated: bool = False) -> Any: """Applies all transforms in `source` to `value`.""" source = field.source if source is None: return value transforms = source.transforms for transform in transforms: - value = _apply_transform_fn(value, transform, field) + if repeated and isinstance(value, list): + value = [_apply_transform_fn(v, transform, field) for v in value] + else: + value = _apply_transform_fn(value, transform, field) return value @@ -169,6 +178,7 @@ def call(self, df: pd.DataFrame) -> Iterator[dict[str, Any]]: df = _extract_value(df, field) def _get_result(row): + """Returns a record parsed as a dictionary of fields.""" result: dict[str, Any] = {} for field in fields: source = field.source @@ -178,8 +188,11 @@ def _get_result(row): f" field {field} to understand why. 
Possible fields: {df.columns}" ) value = row[column] - value = apply_transforms_fn(value, field=field) - if field.repeated: + is_repeated = field.repeated or ( + field.parent and _is_repeated_field(field.parent) + ) + value = apply_transforms_fn(value, field=field, repeated=is_repeated) + if is_repeated: value = [ _cast_value(self.node.ctx, v, field.data_type) for v in value ] @@ -188,7 +201,34 @@ if self.node.ctx.is_v0(): result[field.name] = value else: - result[field.id] = value + if field in self.node.fields: + result[field.id] = value + else: + # Repeated nested sub-fields render as a list of dictionaries. + if field.parent: + if _is_repeated_field(field.parent): + if field.parent.id not in result: + result[field.parent.id] = [ + {field.id: v} for v in value + ] + else: + if len(value) != len(result[field.parent.id]): + raise ValueError( + f"Length of {field.id} doesn't match" + " the number of items already stored for" + f" {field.parent.id}" + ) + for i, v in enumerate(value): + result[field.parent.id][i][field.id] = v + # Non-repeated sub-fields render as a single dictionary. + else: + if field.parent.id not in result: + result[field.parent.id] = {} + result[field.parent.id][field.id] = value + else: + raise ValueError( + f"The field {field.id} is a SubField but has no parent." + ) return result chunk_size = 100 diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py index 3ed6cdd1..9d76430c 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py @@ -76,6 +76,130 @@ def test_cast_value_image(open_mock, conforms_to): assert expected == "opened_image" + +def test_readfield_with_subfields(): + with tempfile.TemporaryDirectory() as tempdir: + csv_file = epath.Path(tempdir, "dummy_csv.csv") + with csv_file.open("w") as f: + f.write("latitude,longitude,names,surnames\n") + f.write("1,1,Anna-Maria,Rossi-Bianchi\n") + f.write("2,2,Giulia,Ferrari\n") + # Nodes to define metadata. + distribution = [ + FileObject( + name="file", + id="file_id", + content_url=csv_file, + sha256="None", + encoding_format="text/csv", + ) + ] + fields = [ + # One field with subfields. + Field( + name="coordinates", + id="main/coordinates", + data_types=[DataType.TEXT], + sub_fields=[ + Field( + name="latitude", + id="main/coordinates/latitude", + data_types=[DataType.INTEGER], + source=Source( + file_object="file_id", + extract=Extract(column="latitude"), + ), + ), + Field( + name="longitude", + id="main/coordinates/longitude", + data_types=[DataType.INTEGER], + source=Source( + file_object="file_id", + extract=Extract(column="longitude"), + ), + ), + ], + ), + # One field with repeated subfields. 
+ Field( + name="checked_users", + id="main/checked_users", + data_types=[DataType.TEXT], + repeated=True, + sub_fields=[ + Field( + name="name", + id="main/checked_users/name", + data_types=[DataType.TEXT], + source=Source( + file_object="file_id", + extract=Extract(column="names"), + transforms=[Transform(separator="-")], + ), + ), + Field( + name="surname", + id="main/checked_users/surname", + data_types=[DataType.TEXT], + source=Source( + file_object="file_id", + extract=Extract(column="surnames"), + transforms=[Transform(separator="-")], + ), + ), + ], + ), + ] + with mock.patch.object(RecordSet, "check_joins_in_fields") as mock_check_joins: + mock_check_joins.return_value = True + record_set = RecordSet(name="main", id="main", fields=fields) + record_sets = [record_set] + ctx = Context(conforms_to=CroissantVersion.V_1_0) + Metadata( + ctx=ctx, + name="metadata", + url="url.com", + distribution=distribution, + record_sets=record_sets, + ) + read_field = ReadFields(operations=Operations(), node=record_sets[0]) + df = pd.read_csv( + csv_file + ) # pd.DataFrame({FileProperty.filepath: [csv_file]}) + expected = [ + { + "main/coordinates": { + "main/coordinates/latitude": 1, + "main/coordinates/longitude": 1, + }, + "main/checked_users": [ + { + "main/checked_users/name": b"Anna", + "main/checked_users/surname": b"Rossi", + }, + { + "main/checked_users/name": b"Maria", + "main/checked_users/surname": b"Bianchi", + }, + ], + }, + { + "main/coordinates": { + "main/coordinates/latitude": 2, + "main/coordinates/longitude": 2, + }, + "main/checked_users": [ + { + "main/checked_users/name": b"Giulia", + "main/checked_users/surname": b"Ferrari", + }, + ], + }, + ] + result = list(read_field.call(df)) + assert result == expected + + @pytest.mark.parametrize( "separator", [ @@ -182,7 +306,7 @@ def test_extract_lines(separator): @pytest.mark.parametrize( - ["value", "source", "data_type", "expected_value"], + ["value", "source", "data_type", "expected_value", "repeated"], [ # Capturing group [ @@ -190,12 +314,21 @@ def test_extract_lines(separator): Source(transforms=[Transform(regex="(train|val)\\d\\d\\d\\d")]), DataType.TEXT, "train", + False, + ], + [ + ["train1234", "train5678", "val1111"], + Source(transforms=[Transform(regex="(train|val)\\d\\d\\d\\d")]), + DataType.TEXT, + ["train", "train", "val"], + True, ], [ epath.Path("path/to/train1234"), Source(transforms=[Transform(regex=".*/(train|val)\\d\\d\\d\\d")]), DataType.TEXT, "train", + False, ], # Non working capturing group [ @@ -203,27 +336,31 @@ def test_extract_lines(separator): Source(transforms=[Transform(regex="(train|val)\\d\\d\\d\\d")]), DataType.TEXT, "foo1234", + False, ], [ {"one": {"two": "expected_value"}, "three": "non_expected_value"}, Source(transforms=[Transform(json_path="one.two")]), DataType.TEXT, "expected_value", + False, ], [ pd.Timestamp("2024-12-10 12:00:00"), Source(transforms=[Transform(format="%Y-%m-%d")]), DataType.DATE, "2024-12-10", + False, ], [ "2024-12-10 12:00:00", Source(transforms=[Transform(format="%Y-%m-%d")]), DataType.DATE, "2024-12-10", + False, ], ], ) -def test_apply_transforms_fn(value, source, data_type, expected_value): +def test_apply_transforms_fn(value, source, data_type, expected_value, repeated): f = Field(id="test", name="test", data_types=data_type, source=source) - assert field.apply_transforms_fn(value, f) == expected_value + assert field.apply_transforms_fn(value, f, repeated=repeated) == expected_value