From a0143ac73dee7ad08f7d6568321a2f21bb691561 Mon Sep 17 00:00:00 2001 From: Herumb Shandilya Date: Thu, 15 Feb 2024 01:01:57 +0530 Subject: [PATCH] Dataloader return patch and doc fixes --- dspy/datasets/dataloader.py | 27 ++- examples/dataloaders/dataloaders_dolly.ipynb | 242 ++++++++++++++----- 2 files changed, 194 insertions(+), 75 deletions(-) diff --git a/dspy/datasets/dataloader.py b/dspy/datasets/dataloader.py index 76df4abef..ac8b1d20f 100644 --- a/dspy/datasets/dataloader.py +++ b/dspy/datasets/dataloader.py @@ -1,3 +1,4 @@ +import dspy from dspy.datasets import Dataset from typing import Union, List @@ -11,7 +12,7 @@ def _process_dataset( self, dataset: Dataset, fields: List[str] = None - ): + ) -> List[dspy.Example]: if not(self.train_size and self.dev_size and self.test_size): self.train_size = 1.0 @@ -42,18 +43,26 @@ def _process_dataset( dev_dataset = tmp_dataset["train"] test_dataset = tmp_dataset["test"] + returned_split = {} if train_split_size: self._train = [{field:row[field] for field in fields} for row in train_dataset] + self.train_size = train_split_size + + returned_split["train"] = self._shuffle_and_sample("train", self._train, self.train_size, self.train_seed) if dev_split_size: self._dev = [{field:row[field] for field in fields} for row in dev_dataset] + self.dev_size = dev_split_size + + returned_split["dev"] = self._shuffle_and_sample("dev", self._dev, self.dev_size, self.dev_seed) if test_split_size: self._test = [{field:row[field] for field in fields} for row in test_dataset] - - self.train_size = train_split_size - self.dev_size = dev_split_size - self.test_size = test_split_size + self.test_size = test_split_size + + returned_split["test"] = self._shuffle_and_sample("test", self._test, self.test_size, self.test_seed) + + return returned_split def from_huggingface( self, @@ -61,7 +70,7 @@ def from_huggingface( fields: List[str] = None, splits: Union[str, List[str]] = None, revision: str = None, - ): + ) -> List[dspy.Example]: dataset = None if splits: if isinstance(splits, str): @@ -86,12 +95,12 @@ def from_huggingface( if not fields: fields = list(dataset.features) - self._process_dataset(dataset, fields) + return self._process_dataset(dataset, fields) - def from_csv(self, file_path:str, fields: List[str] = None): + def from_csv(self, file_path:str, fields: List[str] = None) -> List[dspy.Example]: dataset = load_dataset("csv", data_files=file_path)["train"] if not fields: fields = list(dataset.features) - self._process_dataset(dataset, fields) \ No newline at end of file + return self._process_dataset(dataset, fields) \ No newline at end of file diff --git a/examples/dataloaders/dataloaders_dolly.ipynb b/examples/dataloaders/dataloaders_dolly.ipynb index fc86373f8..e7831e682 100644 --- a/examples/dataloaders/dataloaders_dolly.ipynb +++ b/examples/dataloaders/dataloaders_dolly.ipynb @@ -39,18 +39,163 @@ "base_uri": "https://localhost:8080/" }, "id": "XZ2MimQMkyjA", - "outputId": "23d4ceb5-a65c-4165-dd19-b3f44a4d5d10" + "outputId": "dc046d83-1dc7-437b-ea34-201a62efb072" }, "outputs": [ { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'pkg_resources'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 20\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# Set up the cache for this notebook\u001b[39;00m\n\u001b[1;32m 18\u001b[0m os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDSP_NOTEBOOK_CACHEDIR\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(repo_path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcache\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m \u001b[38;5;66;03m# Install the package if it's not installed\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdspy-ai\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m {pkg\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;28;01mfor\u001b[39;00m pkg \u001b[38;5;129;01min\u001b[39;00m pkg_resources\u001b[38;5;241m.\u001b[39mworking_set}:\n\u001b[1;32m 22\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39msystem(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpip install -U pip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pkg_resources'" + "name": "stdout", + "output_type": "stream", + "text": [ + "fatal: cannot change to 'dspy': No such file or directory\n", + "Cloning into 'dspy'...\n", + "remote: Enumerating objects: 23100, done.\u001b[K\n", + "remote: Counting objects: 100% (875/875), done.\u001b[K\n", + "remote: Compressing objects: 100% (404/404), done.\u001b[K\n", + "remote: Total 23100 (delta 560), reused 714 (delta 467), pack-reused 22225\u001b[K\n", + "Receiving objects: 100% (23100/23100), 24.57 MiB | 21.23 MiB/s, done.\n", + "Resolving deltas: 100% (8919/8919), done.\n", + "Requirement already satisfied: pip in /usr/local/lib/python3.10/dist-packages (23.1.2)\n", + "Collecting pip\n", + " Downloading pip-24.0-py3-none-any.whl (2.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 23.1.2\n", + " Uninstalling pip-23.1.2:\n", + " Successfully uninstalled pip-23.1.2\n", + "Successfully installed pip-24.0\n", + "Obtaining file:///content/dspy\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Checking if build backend supports build_editable ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build editable ... \u001b[?25l\u001b[?25hdone\n", + " Preparing editable metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting backoff~=2.2.1 (from dspy-ai==2.1.10)\n", + " Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: joblib~=1.3.2 in /usr/local/lib/python3.10/dist-packages (from dspy-ai==2.1.10) (1.3.2)\n", + "Collecting openai<2.0.0,>=0.28.1 (from dspy-ai==2.1.10)\n", + " Downloading openai-1.12.0-py3-none-any.whl.metadata (18 kB)\n", + "Collecting pandas~=2.1.1 (from dspy-ai==2.1.10)\n", + " Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n", + "Collecting regex~=2023.10.3 (from dspy-ai==2.1.10)\n", + " Downloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting ujson~=5.8.0 (from dspy-ai==2.1.10)\n", + " Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)\n", + "Requirement already satisfied: tqdm~=4.66.1 in /usr/local/lib/python3.10/dist-packages (from dspy-ai==2.1.10) (4.66.1)\n", + "Collecting datasets~=2.14.6 (from dspy-ai==2.1.10)\n", + " Downloading datasets-2.14.7-py3-none-any.whl.metadata (19 kB)\n", + "Requirement already satisfied: requests~=2.31.0 in /usr/local/lib/python3.10/dist-packages (from dspy-ai==2.1.10) (2.31.0)\n", + "Collecting optuna~=3.4.0 (from dspy-ai==2.1.10)\n", + " Downloading optuna-3.4.0-py3-none-any.whl.metadata (17 kB)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets~=2.14.6->dspy-ai==2.1.10) (1.25.2)\n", + "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets~=2.14.6->dspy-ai==2.1.10) (10.0.1)\n", + "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets~=2.14.6->dspy-ai==2.1.10) (0.6)\n", + "Collecting dill<0.3.8,>=0.3.0 (from datasets~=2.14.6->dspy-ai==2.1.10)\n", + " Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets~=2.14.6->dspy-ai==2.1.10) (3.4.1)\n", + "Collecting multiprocess (from datasets~=2.14.6->dspy-ai==2.1.10)\n", + " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n", + "Requirement already satisfied: fsspec<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets~=2.14.6->dspy-ai==2.1.10) (2023.6.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets~=2.14.6->dspy-ai==2.1.10) (3.9.3)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from datasets~=2.14.6->dspy-ai==2.1.10) (0.20.3)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets~=2.14.6->dspy-ai==2.1.10) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets~=2.14.6->dspy-ai==2.1.10) (6.0.1)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=0.28.1->dspy-ai==2.1.10) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai<2.0.0,>=0.28.1->dspy-ai==2.1.10) (1.7.0)\n", + "Collecting httpx<1,>=0.23.0 (from openai<2.0.0,>=0.28.1->dspy-ai==2.1.10)\n", + " Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)\n", + "Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=0.28.1->dspy-ai==2.1.10) (2.6.1)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=0.28.1->dspy-ai==2.1.10) (1.3.0)\n", + "Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=0.28.1->dspy-ai==2.1.10) (4.9.0)\n", + "Collecting alembic>=1.5.0 (from optuna~=3.4.0->dspy-ai==2.1.10)\n", + " Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)\n", + "Collecting colorlog (from optuna~=3.4.0->dspy-ai==2.1.10)\n", + " Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)\n", + "Requirement already satisfied: sqlalchemy>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from optuna~=3.4.0->dspy-ai==2.1.10) (2.0.25)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas~=2.1.1->dspy-ai==2.1.10) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas~=2.1.1->dspy-ai==2.1.10) (2023.4)\n", + "Collecting tzdata>=2022.1 (from pandas~=2.1.1->dspy-ai==2.1.10)\n", + " Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests~=2.31.0->dspy-ai==2.1.10) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests~=2.31.0->dspy-ai==2.1.10) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests~=2.31.0->dspy-ai==2.1.10) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests~=2.31.0->dspy-ai==2.1.10) (2024.2.2)\n", + "Collecting Mako (from alembic>=1.5.0->optuna~=3.4.0->dspy-ai==2.1.10)\n", + " Downloading Mako-1.3.2-py3-none-any.whl.metadata (2.9 kB)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=0.28.1->dspy-ai==2.1.10) (1.2.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets~=2.14.6->dspy-ai==2.1.10) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets~=2.14.6->dspy-ai==2.1.10) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets~=2.14.6->dspy-ai==2.1.10) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets~=2.14.6->dspy-ai==2.1.10) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets~=2.14.6->dspy-ai==2.1.10) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets~=2.14.6->dspy-ai==2.1.10) (4.0.3)\n", + "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai==2.1.10)\n", + " Downloading httpcore-1.0.3-py3-none-any.whl.metadata (20 kB)\n", + "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai==2.1.10)\n", + " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.14.0->datasets~=2.14.6->dspy-ai==2.1.10) (3.13.1)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai<2.0.0,>=0.28.1->dspy-ai==2.1.10) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.2 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai<2.0.0,>=0.28.1->dspy-ai==2.1.10) (2.16.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas~=2.1.1->dspy-ai==2.1.10) (1.16.0)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy>=1.3.0->optuna~=3.4.0->dspy-ai==2.1.10) (3.0.3)\n", + "INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.\n", + "Collecting multiprocess (from datasets~=2.14.6->dspy-ai==2.1.10)\n", + " Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)\n", + "Requirement already satisfied: MarkupSafe>=0.9.2 in /usr/local/lib/python3.10/dist-packages (from Mako->alembic>=1.5.0->optuna~=3.4.0->dspy-ai==2.1.10) (2.1.5)\n", + "Downloading datasets-2.14.7-py3-none-any.whl (520 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m520.4/520.4 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading openai-1.12.0-py3-none-any.whl (226 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m226.7/226.7 kB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading optuna-3.4.0-py3-none-any.whl (409 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m409.6/409.6 kB\u001b[0m \u001b[31m21.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m51.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m773.9/773.9 kB\u001b[0m \u001b[31m33.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.9/53.9 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading alembic-1.13.1-py3-none-any.whl (233 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.4/233.4 kB\u001b[0m \u001b[31m13.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dill-0.3.7-py3-none-any.whl (115 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading httpx-0.26.0-py3-none-any.whl (75 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading httpcore-1.0.3-py3-none-any.whl (77 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.0/77.0 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m345.4/345.4 kB\u001b[0m \u001b[31m18.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)\n", + "Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading Mako-1.3.2-py3-none-any.whl (78 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.7/78.7 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hBuilding wheels for collected packages: dspy-ai\n", + " Building editable for dspy-ai (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for dspy-ai: filename=dspy_ai-2.1.10-0.editable-py3-none-any.whl size=15381 sha256=a0cada2c73c33f1b3d7cc02530f8afefc53546f944a124b42a70971c7d2c2cf8\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-rtag8hdv/wheels/80/c7/dd/82bee9a1efa54dd8a664012a2537077eafe45c01781e3e9d68\n", + "Successfully built dspy-ai\n", + "Installing collected packages: ujson, tzdata, regex, Mako, h11, dill, colorlog, backoff, pandas, multiprocess, httpcore, alembic, optuna, httpx, openai, datasets, dspy-ai\n", + " Attempting uninstall: regex\n", + " Found existing installation: regex 2023.12.25\n", + " Uninstalling regex-2023.12.25:\n", + " Successfully uninstalled regex-2023.12.25\n", + " Attempting uninstall: pandas\n", + " Found existing installation: pandas 1.5.3\n", + " Uninstalling pandas-1.5.3:\n", + " Successfully uninstalled pandas-1.5.3\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "lida 0.0.10 requires fastapi, which is not installed.\n", + "lida 0.0.10 requires kaleido, which is not installed.\n", + "lida 0.0.10 requires python-multipart, which is not installed.\n", + "lida 0.0.10 requires uvicorn, which is not installed.\n", + "llmx 0.0.15a0 requires cohere, which is not installed.\n", + "llmx 0.0.15a0 requires tiktoken, which is not installed.\n", + "bigframes 0.20.1 requires pandas<2.1.4,>=1.5.0, but you have pandas 2.1.4 which is incompatible.\n", + "google-colab 1.0.0 requires pandas==1.5.3, but you have pandas 2.1.4 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed Mako-1.3.2 alembic-1.13.1 backoff-2.2.1 colorlog-6.8.2 datasets-2.14.7 dill-0.3.7 dspy-ai-2.1.10 h11-0.14.0 httpcore-1.0.3 httpx-0.26.0 multiprocess-0.70.15 openai-1.12.0 optuna-3.4.0 pandas-2.1.4 regex-2023.10.3 tzdata-2024.1 ujson-5.8.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" ] } ], @@ -99,20 +244,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": { "id": "UeB7AEvhshfe" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/krypticmouse/.cache/pypoetry/virtualenvs/dspy-74wouE_3-py3.10/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import dspy\n", "from dspy.datasets import DataLoader\n", @@ -143,23 +279,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "metadata": { "id": "OslWdJbMvVYU" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading readme: 100%|██████████| 8.20k/8.20k [00:00<00:00, 10.7MB/s]\n", - "Downloading data: 100%|██████████| 13.1M/13.1M [00:20<00:00, 649kB/s]\n", - "Generating train split: 15011 examples [00:00, 518904.00 examples/s]\n" - ] - } - ], + "outputs": [], "source": [ - "dl.from_huggingface(\n", + "dolly_dataset = dl.from_huggingface(\n", " \"databricks/databricks-dolly-15k\",\n", " fields=[\"instruction\", \"context\", \"response\"],\n", " splits=\"train\" # or splits = [\"train\", \"test\"]\n", @@ -172,24 +298,25 @@ "id": "RRyYfcTUF121" }, "source": [ - "Once the above code is executed the data would be populated in the `train`, `dev`, and `test` properties of the object:" + "Once the above code is executed the data would be returned in a dict with keys `train`, `dev`, and `test` containing the List of `Example` for each split based on the configuration of size and seed set in the object during initialization:" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CIj3Tkj6zWTq", - "outputId": "6ff849e1-c6f5-453c-d328-ce78a24e15a9" + "outputId": "d9529450-1232-4c70-b8d8-c5171ea9ab43" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "Keys present in the returned dict: ['train', 'dev', 'test']\n", "Number of examples in train set: 7505\n", "Number of examples in dev set: 2251\n", "Number of examples in test set: 1501\n" @@ -197,13 +324,11 @@ } ], "source": [ - "dolly_train_set = dl.train\n", - "dolly_dev_set = dl.dev\n", - "dolly_test_set = dl.test\n", + "print(f\"Keys present in the returned dict: {list(dolly_dataset.keys())}\")\n", "\n", - "print(f\"Number of examples in train set: {len(dolly_train_set)}\")\n", - "print(f\"Number of examples in dev set: {len(dolly_dev_set)}\")\n", - "print(f\"Number of examples in test set: {len(dolly_test_set)}\")" + "print(f\"Number of examples in train set: {len(dolly_dataset['train'])}\")\n", + "print(f\"Number of examples in dev set: {len(dolly_dataset['dev'])}\")\n", + "print(f\"Number of examples in test set: {len(dolly_dataset['test'])}\")" ] }, { @@ -222,25 +347,17 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "metadata": { "id": "tkK14QOVQwph" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generating train split: 100 examples [00:00, 27541.56 examples/s]\n" - ] - } - ], + "outputs": [], "source": [ "dl = DataLoader(\n", " train_size = 1.0, # Must be float\n", ")\n", "\n", - "dl.from_csv(\n", + "dolly_100_dataset = dl.from_csv(\n", " \"dolly_subset_100_rows.csv\",\n", " fields=[\"instruction\", \"context\", \"response\"],\n", ")" @@ -252,32 +369,33 @@ "id": "q2Zcn1X2RPr_" }, "source": [ - "Once the above code is executed the data would be populated in the `train`, `dev`, and `test` properties of the object:" + "Once the above code is executed the data would be returned in a dict with keys `train`, `dev`, and `test` containing the List of `Example` for each split based on the configuration of size and seed set in the object during initialization" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TIrNy_wrQ4B5", - "outputId": "cd16d1d1-519a-4bbb-9c79-b89ce2f8e861" + "outputId": "0d44d6b5-b72e-48af-939b-3938060e5b37" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "Keys present in the returned dict: ['train']\n", "Number of examples in train set: 100\n" ] } ], "source": [ - "dolly_train_set = dl.train\n", + "print(f\"Keys present in the returned dict: {list(dolly_100_dataset.keys())}\")\n", "\n", - "print(f\"Number of examples in train set: {len(dolly_train_set)}\")" + "print(f\"Number of examples in train set: {len(dolly_100_dataset['train'])}\")" ] }, { @@ -299,15 +417,7 @@ "name": "python3" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", "version": "3.10.12" } },