From 8366cddcf49452a8f1ca859afa48ab85da16ce9a Mon Sep 17 00:00:00 2001 From: Austin DeNoble Date: Wed, 13 Dec 2023 15:33:20 -0500 Subject: [PATCH 1/6] bump @pinecone-database dependency to support the new global control plane, adjust tests and code where necessary --- package-lock.json | 96 ++++++++++++++++++++---- package.json | 2 +- src/index.ts | 2 +- src/load.ts | 25 +++--- tests/integration/deleteIndex.test.ts | 18 +++-- tests/integration/semanticSearch.test.ts | 31 +++++--- 6 files changed, 131 insertions(+), 43 deletions(-) diff --git a/package-lock.json b/package-lock.json index 44595ec..c7a44d5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "0.0.1", "license": "MIT", "dependencies": { - "@pinecone-database/pinecone": "^1.0.0", + "@pinecone-database/pinecone": "^1.1.2-spruceDev.20231211000839", "@xenova/transformers": "2.0.1", "cli-progress": "^3.12.0", "dotenv": "^16.0.3", @@ -93,6 +93,25 @@ "@jridgewell/sourcemap-codec": "^1.4.10" } }, + "node_modules/@edge-runtime/primitives": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/@edge-runtime/primitives/-/primitives-4.0.5.tgz", + "integrity": "sha512-t7QiN5d/KpXgCvIfSt6Nm9Hj3WVdNgc5CpOD73jasY+9EvTI7Ngdj5cXvjcHrPcmYWJZMySPgeEeoL/1N/Llag==", + "engines": { + "node": ">=16" + } + }, + "node_modules/@edge-runtime/types": { + "version": "2.2.7", + "resolved": "https://registry.npmjs.org/@edge-runtime/types/-/types-2.2.7.tgz", + "integrity": "sha512-9MTwGooICP7+ZsX9BTy6YCRzOr4tP6RFRymsc8CaKORfvuAHgLZUQaLwILfQ94tddufVXcBwq637VfEd3ZXbWA==", + "dependencies": { + "@edge-runtime/primitives": "4.0.5" + }, + "engines": { + "node": ">=16" + } + }, "node_modules/@esbuild/android-arm": { "version": "0.18.20", "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.18.20.tgz", @@ -639,23 +658,34 @@ } }, "node_modules/@pinecone-database/pinecone": { - "version": "1.0.0", - "resolved": 
"https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-1.0.0.tgz", - "integrity": "sha512-CtsfbK4qTDjnS56FVH64FEWNVnhwOyheBlLe3e9T6o9Gaxc00f/079JWUUiZ1lrkc3K/YkmlYYOXbdGyKP2z3A==", + "version": "1.1.2-spruceDev.20231211000839", + "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-1.1.2-spruceDev.20231211000839.tgz", + "integrity": "sha512-id2Wau1eAP4Eh+CS+4Dg41Zh1gsSG828Li8V4FueHRS81WuIZ0WE37HENG6FUVijVeteG8y19I360rqYJEceHQ==", "dependencies": { - "@sinclair/typebox": "^0.28.15", - "@types/web": "^0.0.99", + "@edge-runtime/types": "^2.2.3", + "@sinclair/typebox": "^0.29.0", + "@types/node": "^18.11.17", "ajv": "^8.12.0", - "cross-fetch": "^3.1.5" + "cross-fetch": "^3.1.5", + "encoding": "^0.1.13", + "typescript": "^4.9.4" }, "engines": { "node": ">=14.0.0" } }, "node_modules/@pinecone-database/pinecone/node_modules/@sinclair/typebox": { - "version": "0.28.20", - "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.28.20.tgz", - "integrity": "sha512-QCF3BGfacwD+3CKhGsMeixnwOmX4AWgm61nKkNdRStyLVu0mpVFYlDSY8gVBOOED1oSwzbJauIWl/+REj8K5+w==" + "version": "0.29.6", + "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.29.6.tgz", + "integrity": "sha512-aX5IFYWlMa7tQ8xZr3b2gtVReCvg7f3LEhjir/JAjX2bJCMVJA5tIPv30wTD4KDfcwMd7DDYY3hFDeGmOgtrZQ==" + }, + "node_modules/@pinecone-database/pinecone/node_modules/@types/node": { + "version": "18.19.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.3.tgz", + "integrity": "sha512-k5fggr14DwAytoA/t8rPrIz++lXK7/DqckthCmoZOKNsEbJkId4Z//BqgApXBUGrGddrigYa1oqheo/7YmW4rg==", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@pinecone-database/pinecone/node_modules/ajv": { "version": "8.12.0", @@ -677,6 +707,18 @@ "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==" }, + 
"node_modules/@pinecone-database/pinecone/node_modules/typescript": { + "version": "4.9.5", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz", + "integrity": "sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=4.2.0" + } + }, "node_modules/@protobufjs/aspromise": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", @@ -828,11 +870,6 @@ "integrity": "sha512-kNnC1GFBLuhImSnV7w4njQkUiJi0ZXUycu1rUaouPqiKlXkh77JKgdRnTAp1x5eBwcIwbtI+3otwzuIDEuDoxQ==", "dev": true }, - "node_modules/@types/web": { - "version": "0.0.99", - "resolved": "https://registry.npmjs.org/@types/web/-/web-0.0.99.tgz", - "integrity": "sha512-xMz3tOvtkZzc7RpQrDNiLe5sfMmP+fz8bOxHIZ/U8qXyvzDX4L4Ss1HCjor/O9DSelba+1iXK1VM7lruX28hiQ==" - }, "node_modules/@types/yargs": { "version": "17.0.24", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.24.tgz", @@ -1755,6 +1792,14 @@ "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==" }, + "node_modules/encoding": { + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/encoding/-/encoding-0.1.13.tgz", + "integrity": "sha512-ETBauow1T35Y/WZMkio9jiM0Z5xjHHmJ4XmjZOq1l/dXz3lr2sRn87nJy20RupqSh1F2m3HHPSp8ShIPQJrJ3A==", + "dependencies": { + "iconv-lite": "^0.6.2" + } + }, "node_modules/end-of-stream": { "version": "1.4.4", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", @@ -2708,6 +2753,17 @@ "integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==", "dev": true }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": 
"sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/ieee754": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", @@ -4470,6 +4526,11 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" + }, "node_modules/semver": { "version": "7.5.4", "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", @@ -5201,6 +5262,11 @@ "integrity": "sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==", "dev": true }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "node_modules/uri-js": { "version": "4.4.1", "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", diff --git a/package.json b/package.json index 6e54cc1..4717888 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,7 @@ "format:check": "npx prettier --check src" }, "dependencies": { - "@pinecone-database/pinecone": "^1.0.0", + "@pinecone-database/pinecone": "^1.1.2-spruceDev.20231211000839", "@xenova/transformers": "2.0.1", "cli-progress": "^3.12.0", "dotenv": "^16.0.3", diff --git a/src/index.ts b/src/index.ts index a687d58..ad8b5bd 100644 --- a/src/index.ts +++ b/src/index.ts @@ -88,7 +88,7 @@ export const run = async () => { return parser.parse(); }; -// In case it is not test enviroment run automaticly +// In case it is not test enviroment run automatically /* c8 ignore start */ if (typeof 
vitest === "undefined") { run(); diff --git a/src/load.ts b/src/load.ts index b035c36..00faab8 100644 --- a/src/load.ts +++ b/src/load.ts @@ -39,17 +39,20 @@ export const load = async (csvPath: string, column: string) => { // Get index name const indexName = getEnv("PINECONE_INDEX"); - // Check whether the index already exists. If it doesn't, create - // a Pinecone index with a dimension of 384 to hold the outputs - // of our embeddings model. - const indexList = await pinecone.listIndexes(); - if (indexList.indexOf({ name: indexName }) === -1) { - await pinecone.createIndex({ - name: indexName, - dimension: 384, - waitUntilReady: true, - }); - } + // Create a Pinecone index with a dimension of 384 to hold the outputs + // of our embeddings model. Use suppressConflicts in case the index already exists. + await pinecone.createIndex({ + name: indexName, + dimension: 384, + spec: { + serverless: { + region: "us-west-2", + cloud: "aws", + }, + }, + waitUntilReady: true, + suppressConflicts: true, + }); // Select the target Pinecone index. 
Passing the TextMetadata generic type parameter // allows typescript to know what shape to expect when interacting with a record's diff --git a/tests/integration/deleteIndex.test.ts b/tests/integration/deleteIndex.test.ts index 46dfcda..664680d 100644 --- a/tests/integration/deleteIndex.test.ts +++ b/tests/integration/deleteIndex.test.ts @@ -15,14 +15,22 @@ describe("Delete", () => { try { const pinecone = new Pinecone(); - const indexList = await pinecone.listIndexes(); - if (indexList.indexOf({ name: INDEX_NAME }) === -1) { - await pinecone.createIndex({ name: INDEX_NAME, dimension: 384, waitUntilReady: true }) - } + await pinecone.createIndex({ + name: INDEX_NAME, + dimension: 384, + spec: { + serverless: { + region: "us-west-2", + cloud: "aws", + }, + }, + waitUntilReady: true, + suppressConflicts: true, + }); } catch (error) { console.error(error); } - }, // Set timeout to 5 mins, becouse creating index can take time + }, // Set timeout to 5 mins, because creating index can take time 5 * 60 * 1000 ); diff --git a/tests/integration/semanticSearch.test.ts b/tests/integration/semanticSearch.test.ts index 824f431..6531e22 100644 --- a/tests/integration/semanticSearch.test.ts +++ b/tests/integration/semanticSearch.test.ts @@ -1,4 +1,4 @@ -import { Pinecone } from '@pinecone-database/pinecone'; +import { Pinecone } from "@pinecone-database/pinecone"; import { run } from "@src/index.js"; import { createMockOnProcessExit, randomizeIndexName } from "../utils/index.js"; @@ -11,9 +11,9 @@ describe( // eslint-disable-next-line @typescript-eslint/no-empty-function const consoleMock = vi.spyOn(console, "error").mockImplementation(() => {}); - // In case our test fails it will be reruned. - // We whant to ensure that we are using new index but keep track of previus ones - // so we are able to clean after tests are done + // In case our test fails it will be rerun. 
+ // We want to ensure that we are using a new index, and we keep track + // of previous indexes so we can clean up when we're done. const createdIndexes: string[] = []; const setIndexName = (name: string) => { const indexName = randomizeIndexName(name); @@ -26,7 +26,7 @@ describe( const pinecone = new Pinecone(); const listIndexes = pinecone.listIndexes(); for (const indexName in listIndexes) { - await pinecone.deleteIndex(indexName) + await pinecone.deleteIndex(indexName); } }); @@ -67,12 +67,21 @@ describe( const pinecone = new Pinecone(); const index = pinecone.index(indexName); - const stats = await index - .describeIndexStats(); + let stats = await index.describeIndexStats(); + + // Records can take some time to become available in the index after upsert, + // so we wait until the namespace is populated before moving on to asserts + while ( + (stats.namespaces && !stats.namespaces[""]) || + (stats.namespaces && stats.namespaces[""].recordCount === 0) + ) { + await new Promise((resolve) => setTimeout(resolve, 3000)); + stats = await index.describeIndexStats(); + } // Ensure that all vectors are added if (stats.namespaces) { - const defaultNamespaceStats = stats.namespaces[''] + const defaultNamespaceStats = stats.namespaces[""]; expect(defaultNamespaceStats.recordCount).toEqual(4); } expect(stats.totalRecordCount).toEqual(4); @@ -124,9 +133,11 @@ describe( process.argv = ["node", "../../src/index", "delete"]; await run(); - expect(consoleMock).toHaveBeenCalledWith(expect.stringContaining("PineconeNotFoundError")); + expect(consoleMock).toHaveBeenCalledWith( + expect.stringContaining("PineconeNotFoundError") + ); }); }, - // Set timeout to 5 mins, becouse creating index can take time + // Set timeout to 5 mins, because creating index can take time 5 * 60 * 1000 ); From 7561c19bd60043c52693d6c1f1483796e7ece846 Mon Sep 17 00:00:00 2001 From: Austin DeNoble Date: Tue, 19 Dec 2023 11:06:42 -0500 Subject: [PATCH 2/6] remove PINECONE_ENVIRONMENT from README.md --- 
README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 8bdce00..9b7849a 100644 --- a/README.md +++ b/README.md @@ -25,11 +25,10 @@ Copy the template file: cp .env.example .env ``` -And fill in your API key and environment details: +And fill in your API key and index name: ```sh PINECONE_API_KEY= -PINECONE_ENVIRONMENT= PINECONE_INDEX=semantic-search ``` From 072d6d8a0cfae8479dea6fef67de5ad29f228dbb Mon Sep 17 00:00:00 2001 From: Austin DeNoble Date: Tue, 19 Dec 2023 11:26:54 -0500 Subject: [PATCH 3/6] update README to reflect code changes in load.ts --- README.md | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9b7849a..8dca7a4 100644 --- a/README.md +++ b/README.md @@ -180,13 +180,20 @@ export const load = async (csvPath: string, column: string) => { // Get index name const indexName = getEnv("PINECONE_INDEX"); - // Check whether the index already exists. If it doesn't, create - // a Pinecone index with a dimension of 384 to hold the outputs - // of our embeddings model. - const indexList = await pinecone.listIndexes(); - if (indexList.indexOf({ name: indexName }) === -1) { - await pinecone.createIndex({ name: indexName, dimension: 384, waitUntilReady: true }) - } + // Create a Pinecone index with a dimension of 384 to hold the outputs + // of our embeddings model. Use suppressConflicts in case the index already exists. + await pinecone.createIndex({ + name: indexName, + dimension: 384, + spec: { + serverless: { + region: "us-west-2", + cloud: "aws", + }, + }, + waitUntilReady: true, + suppressConflicts: true, + }); // Select the target Pinecone index. 
Passing the TextMetadata generic type parameter // allows typescript to know what shape to expect when interacting with a record's From 6420237bf9e2ed79fa19c47f7984d4b11778ab39 Mon Sep 17 00:00:00 2001 From: Austin DeNoble Date: Fri, 29 Dec 2023 14:42:52 -0500 Subject: [PATCH 4/6] fully remove PINECONE_ENVIRONMENT from github workflows and the env variable validator --- .env.example | 1 - .github/actions/integrationTests/action.yml | 4 ---- .github/workflows/regularCheck.yml | 3 --- .github/workflows/validate.yml | 3 --- src/utils/util.ts | 1 - 5 files changed, 12 deletions(-) diff --git a/.env.example b/.env.example index a9cd685..087eaf4 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,2 @@ PINECONE_API_KEY= -PINECONE_ENVIRONMENT= PINECONE_INDEX=semantic-search \ No newline at end of file diff --git a/.github/actions/integrationTests/action.yml b/.github/actions/integrationTests/action.yml index de97e76..84c8fec 100644 --- a/.github/actions/integrationTests/action.yml +++ b/.github/actions/integrationTests/action.yml @@ -4,9 +4,6 @@ inputs: pinecone_api_key: description: "API key" required: true - pinecone_environment: - description: "Environment/region to target" - required: true runs: using: "composite" steps: @@ -15,7 +12,6 @@ runs: env: CI: true PINECONE_API_KEY: ${{ inputs.pinecone_api_key }} - PINECONE_ENVIRONMENT: ${{ inputs.pinecone_environment }} PINECONE_INDEX: "semantic-search-testing" run: npm run test - name: "Report Coverage" diff --git a/.github/workflows/regularCheck.yml b/.github/workflows/regularCheck.yml index a86528a..76fde2f 100644 --- a/.github/workflows/regularCheck.yml +++ b/.github/workflows/regularCheck.yml @@ -8,8 +8,6 @@ on: secrets: PINECONE_API_KEY: required: true - PINECONE_ENVIRONMENT: - required: true jobs: run-integration-tests: name: Integration tests @@ -28,4 +26,3 @@ jobs: uses: ./.github/actions/integrationTests with: PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} - PINECONE_ENVIRONMENT: ${{ 
secrets.PINECONE_ENVIRONMENT }} diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 7dbd1d5..2e503e2 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -7,8 +7,6 @@ on: secrets: PINECONE_API_KEY: required: true - PINECONE_ENVIRONMENT: - required: true jobs: basic-hygiene: @@ -49,4 +47,3 @@ jobs: uses: ./.github/actions/integrationTests with: PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} - PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }} diff --git a/src/utils/util.ts b/src/utils/util.ts index 3e79728..1c1ce0a 100644 --- a/src/utils/util.ts +++ b/src/utils/util.ts @@ -14,7 +14,6 @@ function getEnv(key: string): string { const validateEnvironmentVariables = () => { getEnv("PINECONE_API_KEY"); - getEnv("PINECONE_ENVIRONMENT"); getEnv("PINECONE_INDEX"); }; From 80cb0755f59260339b65ef5cddc109434ada97dd Mon Sep 17 00:00:00 2001 From: Austin DeNoble Date: Mon, 15 Jan 2024 16:59:48 -0500 Subject: [PATCH 5/6] update environment variables to support cloud and region, update associated readme examples --- .env.example | 4 +- .github/actions/integrationTests/action.yml | 2 + README.md | 81 +++++++++++---------- package-lock.json | 57 ++------------- package.json | 2 +- src/load.ts | 13 +++- src/utils/util.ts | 2 + 7 files changed, 65 insertions(+), 96 deletions(-) diff --git a/.env.example b/.env.example index 087eaf4..76880f9 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,4 @@ PINECONE_API_KEY= -PINECONE_INDEX=semantic-search \ No newline at end of file +PINECONE_INDEX="semantic-search" +PINECONE_CLOUD="aws" +PINECONE_REGION="us-west-2" \ No newline at end of file diff --git a/.github/actions/integrationTests/action.yml b/.github/actions/integrationTests/action.yml index 84c8fec..bef5b21 100644 --- a/.github/actions/integrationTests/action.yml +++ b/.github/actions/integrationTests/action.yml @@ -13,6 +13,8 @@ runs: CI: true PINECONE_API_KEY: ${{ inputs.pinecone_api_key }} PINECONE_INDEX: 
"semantic-search-testing" + PINECONE_CLOUD: "aws" + PINECONE_REGION: "us-west-2" run: npm run test - name: "Report Coverage" if: always() # Also generate the report if tests are failing diff --git a/README.md b/README.md index 8dca7a4..cea2284 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ In this walkthrough we will see how to use Pinecone for semantic search. ## Setup Prerequisites: + - `Node.js` version >=18.0.0 Clone the repository and install the dependencies. @@ -17,7 +18,7 @@ npm install ### Configuration -In order to run this example, you have to supply the Pinecone credentials needed to interact with the Pinecone API. You can find these credentials in the Pinecone web console. This project uses `dotenv` to easily load values from the `.env` file into the environment when executing. +In order to run this example, you have to supply the Pinecone credentials needed to interact with the Pinecone API. You can find these credentials in the Pinecone web console. This project uses `dotenv` to easily load values from the `.env` file into the environment when executing. Copy the template file: @@ -29,11 +30,15 @@ And fill in your API key and index name: ```sh PINECONE_API_KEY= -PINECONE_INDEX=semantic-search +PINECONE_INDEX="semantic-search" +PINECONE_CLOUD="aws" +PINECONE_REGION="us-west-2" ``` `PINECONE_INDEX` is the name of the index where this demo will store and query embeddings. You can change `PINECONE_INDEX` to any name you like, but make sure the name not going to collide with any indexes you are already using. +`PINECONE_CLOUD` and `PINECONE_REGION` define where the index should be deployed. Currently, `aws` and `us-west-2` is the only available cloud and region combination, so it's recommended to leave these values at their defaults. 
+ ### Building To build the project please run the command: @@ -51,8 +56,8 @@ There are two main components to this application: the data loader (load.ts) and The data loading process starts with the CSV file. 
This file contains the articles that will be indexed and made searchable. To load this data, the project uses the `papaparse` library. The loadCSVFile function in `csvLoader.ts` reads the file and uses `papaparse` to parse the CSV data into JavaScript objects. The `dynamicTyping` option is set to true to automatically convert the data to the appropriate types. After this step, you will have an array of objects, where each object represents an article​. ```typescript -import fs from "fs/promises"; -import Papa from "papaparse"; +import fs from 'fs/promises'; +import Papa from 'papaparse'; async function loadCSVFile( filePath: string @@ -62,7 +67,7 @@ async function loadCSVFile( const csvAbsolutePath = await fs.realpath(filePath); // Create a readable stream from the CSV file - const data = await fs.readFile(csvAbsolutePath, "utf8"); + const data = await fs.readFile(csvAbsolutePath, 'utf8'); // Parse the CSV file return await Papa.parse(data, { @@ -84,19 +89,19 @@ export default loadCSVFile; The text embedding operation is performed in the `Embedder` class. This class uses a pipeline from the [`@xenova/transformers`](https://github.com/xenova/transformers.js) library to generate embeddings for the input text. We use the [`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) model to generate the embeddings. The class provides methods to embed a single string or an array of strings in batches​ - which will come in useful a bit later. 
```typescript -import type { PineconeRecord } from "@pinecone-database/pinecone"; -import type { TextMetadata } from "./types.js"; -import { Pipeline } from "@xenova/transformers"; -import { v4 as uuidv4 } from "uuid"; -import { sliceIntoChunks } from "./utils/util.js"; +import type { PineconeRecord } from '@pinecone-database/pinecone'; +import type { TextMetadata } from './types.js'; +import { Pipeline } from '@xenova/transformers'; +import { v4 as uuidv4 } from 'uuid'; +import { sliceIntoChunks } from './utils/util.js'; class Embedder { private pipe: Pipeline | null = null; // Initialize the pipeline async init() { - const { pipeline } = await import("@xenova/transformers"); - this.pipe = await pipeline("embeddings", "Xenova/all-MiniLM-L6-v2"); + const { pipeline } = await import('@xenova/transformers'); + this.pipe = await pipeline('embeddings', 'Xenova/all-MiniLM-L6-v2'); } // Embed a single string @@ -131,7 +136,6 @@ class Embedder { const embedder = new Embedder(); export { embedder }; - ``` ## Loading embeddings into Pinecone @@ -139,15 +143,15 @@ export { embedder }; Now that we have a way to load data and create embeddings, let put the two together and save the embeddings in Pinecone. In the following section, we get the path of the file we need to process from the command like. We load the CSV file, create the Pinecone index and then start the embedding process. The embedding process is done in batches of 1000. Once we have a batch of embeddings, we insert them into the index. 
```typescript -import cliProgress from "cli-progress"; -import { config } from "dotenv"; -import loadCSVFile from "./csvLoader.js"; +import cliProgress from 'cli-progress'; +import { config } from 'dotenv'; +import loadCSVFile from './csvLoader.js'; -import { embedder } from "./embeddings.js"; +import { embedder } from './embeddings.js'; import { Pinecone } from '@pinecone-database/pinecone'; -import { getEnv, validateEnvironmentVariables } from "./utils/util.js"; +import { getEnv, validateEnvironmentVariables } from './utils/util.js'; -import type { TextMetadata } from "./types.js"; +import type { TextMetadata } from './types.js'; // Load environment variables from .env config(); @@ -161,7 +165,7 @@ let counter = 0; export const load = async (csvPath: string, column: string) => { validateEnvironmentVariables(); - + // Get a Pinecone instance const pinecone = new Pinecone(); @@ -177,8 +181,10 @@ export const load = async (csvPath: string, column: string) => { // Extract the selected column from the CSV file const documents = data.map((row) => row[column] as string); - // Get index name - const indexName = getEnv("PINECONE_INDEX"); + // Get index name, cloud, and region + const indexName = getEnv('PINECONE_INDEX'); + const indexCloud = getEnv('PINECONE_CLOUD'); + const indexRegion = getEnv('PINECONE_REGION'); // Create a Pinecone index with a dimension of 384 to hold the outputs // of our embeddings model. Use suppressConflicts in case the index already exists. 
@@ -187,8 +193,8 @@ export const load = async (csvPath: string, column: string) => { dimension: 384, spec: { serverless: { - region: "us-west-2", - cloud: "aws", + region: indexRegion, + cloud: indexCloud, }, }, waitUntilReady: true, @@ -208,7 +214,7 @@ export const load = async (csvPath: string, column: string) => { await embedder.embedBatch(documents, 100, async (embeddings) => { counter += embeddings.length; // Whenever the batch embedding process returns a batch of embeddings, insert them into the index - await index.upsert(embeddings) + await index.upsert(embeddings); progressBar.update(counter); }); @@ -252,11 +258,11 @@ Index is ready. Now that our index is populated we can begin making queries. We are performing a semantic search for similar questions, so we should embed and search with another question. ```typescript -import { config } from "dotenv"; -import { embedder } from "./embeddings.js"; -import { Pinecone } from "@pinecone-database/pinecone"; -import { getEnv, validateEnvironmentVariables } from "./utils/util.js"; -import type { TextMetadata } from "./types.js"; +import { config } from 'dotenv'; +import { embedder } from './embeddings.js'; +import { Pinecone } from '@pinecone-database/pinecone'; +import { getEnv, validateEnvironmentVariables } from './utils/util.js'; +import type { TextMetadata } from './types.js'; config(); @@ -265,9 +271,9 @@ export const query = async (query: string, topK: number) => { const pinecone = new Pinecone(); // Target the index - const indexName = getEnv("PINECONE_INDEX"); + const indexName = getEnv('PINECONE_INDEX'); const index = pinecone.index(indexName); - + await embedder.init(); // Embed the query @@ -278,7 +284,7 @@ export const query = async (query: string, topK: number) => { vector: queryEmbedding.values, topK, includeMetadata: true, - includeValues: false + includeValues: false, }); // Print the results @@ -291,7 +297,6 @@ export const query = async (query: string, topK: number) => { })) ); }; - ``` The 
querying process is very similar to the indexing process. We create a Pinecone client, select the index we want to query, and then embed the query. We then use the `query` method to search the index for the most similar embeddings. The `query` method returns a list of matches. Each match contains the metadata associated with the embedding, as well as the score of the match. @@ -307,11 +312,11 @@ The result for this will be something like: ```js [ { - text: "Which country in the world has the largest population?", + text: 'Which country in the world has the largest population?', score: 0.79473877, }, { - text: "Which cities are the most densely populated?", + text: 'Which cities are the most densely populated?', score: 0.706895828, }, ]; @@ -328,11 +333,11 @@ And the result: ```js [ { - text: "Which cities are the most densely populated?", + text: 'Which cities are the most densely populated?', score: 0.66688776, }, { - text: "What are the most we dangerous cities in the world?", + text: 'What are the most we dangerous cities in the world?', score: 0.556335568, }, ]; diff --git a/package-lock.json b/package-lock.json index c7a44d5..fa4a386 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "0.0.1", "license": "MIT", "dependencies": { - "@pinecone-database/pinecone": "^1.1.2-spruceDev.20231211000839", + "@pinecone-database/pinecone": "^1.1.3-spruceDev.20240115214739", "@xenova/transformers": "2.0.1", "cli-progress": "^3.12.0", "dotenv": "^16.0.3", @@ -93,25 +93,6 @@ "@jridgewell/sourcemap-codec": "^1.4.10" } }, - "node_modules/@edge-runtime/primitives": { - "version": "4.0.5", - "resolved": "https://registry.npmjs.org/@edge-runtime/primitives/-/primitives-4.0.5.tgz", - "integrity": "sha512-t7QiN5d/KpXgCvIfSt6Nm9Hj3WVdNgc5CpOD73jasY+9EvTI7Ngdj5cXvjcHrPcmYWJZMySPgeEeoL/1N/Llag==", - "engines": { - "node": ">=16" - } - }, - "node_modules/@edge-runtime/types": { - "version": "2.2.7", - "resolved": 
"https://registry.npmjs.org/@edge-runtime/types/-/types-2.2.7.tgz", - "integrity": "sha512-9MTwGooICP7+ZsX9BTy6YCRzOr4tP6RFRymsc8CaKORfvuAHgLZUQaLwILfQ94tddufVXcBwq637VfEd3ZXbWA==", - "dependencies": { - "@edge-runtime/primitives": "4.0.5" - }, - "engines": { - "node": ">=16" - } - }, "node_modules/@esbuild/android-arm": { "version": "0.18.20", "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.18.20.tgz", @@ -658,17 +639,14 @@ } }, "node_modules/@pinecone-database/pinecone": { - "version": "1.1.2-spruceDev.20231211000839", - "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-1.1.2-spruceDev.20231211000839.tgz", - "integrity": "sha512-id2Wau1eAP4Eh+CS+4Dg41Zh1gsSG828Li8V4FueHRS81WuIZ0WE37HENG6FUVijVeteG8y19I360rqYJEceHQ==", + "version": "1.1.3-spruceDev.20240115214739", + "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-1.1.3-spruceDev.20240115214739.tgz", + "integrity": "sha512-R3VCHgf5Qu2qbfoGGqTTWdVU6Y1Fo66sQD6lrv001hoaEHSIDn96WrQxuaOzgekCkT5xMZsat2p18gtTvBcigw==", "dependencies": { - "@edge-runtime/types": "^2.2.3", "@sinclair/typebox": "^0.29.0", - "@types/node": "^18.11.17", "ajv": "^8.12.0", "cross-fetch": "^3.1.5", - "encoding": "^0.1.13", - "typescript": "^4.9.4" + "encoding": "^0.1.13" }, "engines": { "node": ">=14.0.0" @@ -679,14 +657,6 @@ "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.29.6.tgz", "integrity": "sha512-aX5IFYWlMa7tQ8xZr3b2gtVReCvg7f3LEhjir/JAjX2bJCMVJA5tIPv30wTD4KDfcwMd7DDYY3hFDeGmOgtrZQ==" }, - "node_modules/@pinecone-database/pinecone/node_modules/@types/node": { - "version": "18.19.3", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.3.tgz", - "integrity": "sha512-k5fggr14DwAytoA/t8rPrIz++lXK7/DqckthCmoZOKNsEbJkId4Z//BqgApXBUGrGddrigYa1oqheo/7YmW4rg==", - "dependencies": { - "undici-types": "~5.26.4" - } - }, "node_modules/@pinecone-database/pinecone/node_modules/ajv": { "version": "8.12.0", "resolved": 
"https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz", @@ -707,18 +677,6 @@ "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==" }, - "node_modules/@pinecone-database/pinecone/node_modules/typescript": { - "version": "4.9.5", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz", - "integrity": "sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==", - "bin": { - "tsc": "bin/tsc", - "tsserver": "bin/tsserver" - }, - "engines": { - "node": ">=4.2.0" - } - }, "node_modules/@protobufjs/aspromise": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", @@ -5262,11 +5220,6 @@ "integrity": "sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==", "dev": true }, - "node_modules/undici-types": { - "version": "5.26.5", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", - "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" - }, "node_modules/uri-js": { "version": "4.4.1", "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", diff --git a/package.json b/package.json index 4717888..a4dc39c 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,7 @@ "format:check": "npx prettier --check src" }, "dependencies": { - "@pinecone-database/pinecone": "^1.1.2-spruceDev.20231211000839", + "@pinecone-database/pinecone": "^1.1.3-spruceDev.20240115214739", "@xenova/transformers": "2.0.1", "cli-progress": "^3.12.0", "dotenv": "^16.0.3", diff --git a/src/load.ts b/src/load.ts index 00faab8..fd31456 100644 --- a/src/load.ts +++ b/src/load.ts @@ -3,7 +3,10 @@ import { config } from "dotenv"; import loadCSVFile from "./csvLoader.js"; import { embedder } from "./embeddings.js"; -import 
{ Pinecone } from "@pinecone-database/pinecone"; +import { + Pinecone, + type ServerlessSpecCloudEnum, +} from "@pinecone-database/pinecone"; import { getEnv, validateEnvironmentVariables } from "./utils/util.js"; import type { TextMetadata } from "./types.js"; @@ -36,8 +39,10 @@ export const load = async (csvPath: string, column: string) => { // Extract the selected column from the CSV file const documents = data.map((row) => row[column] as string); - // Get index name + // Get index name, cloud, and region const indexName = getEnv("PINECONE_INDEX"); + const indexCloud = getEnv("PINECONE_CLOUD") as ServerlessSpecCloudEnum; + const indexRegion = getEnv("PINECONE_REGION"); // Create a Pinecone index with a dimension of 384 to hold the outputs // of our embeddings model. Use suppressConflicts in case the index already exists. @@ -46,8 +51,8 @@ export const load = async (csvPath: string, column: string) => { dimension: 384, spec: { serverless: { - region: "us-west-2", - cloud: "aws", + region: indexRegion, + cloud: indexCloud, }, }, waitUntilReady: true, diff --git a/src/utils/util.ts b/src/utils/util.ts index 1c1ce0a..651688b 100644 --- a/src/utils/util.ts +++ b/src/utils/util.ts @@ -15,6 +15,8 @@ function getEnv(key: string): string { const validateEnvironmentVariables = () => { getEnv("PINECONE_API_KEY"); getEnv("PINECONE_INDEX"); + getEnv("PINECONE_CLOUD"); + getEnv("PINECONE_REGION"); }; export { getEnv, sliceIntoChunks, validateEnvironmentVariables }; From 2ffca08d35a034aaa704cc406ef11c4651eb467d Mon Sep 17 00:00:00 2001 From: Austin DeNoble Date: Tue, 16 Jan 2024 08:27:02 -0500 Subject: [PATCH 6/6] bump to pinecone-database v2 --- package-lock.json | 8 ++++---- package.json | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index fa4a386..2fba967 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "0.0.1", "license": "MIT", "dependencies": { - "@pinecone-database/pinecone": 
"^1.1.3-spruceDev.20240115214739", + "@pinecone-database/pinecone": "^2.0.0", "@xenova/transformers": "2.0.1", "cli-progress": "^3.12.0", "dotenv": "^16.0.3", @@ -639,9 +639,9 @@ } }, "node_modules/@pinecone-database/pinecone": { - "version": "1.1.3-spruceDev.20240115214739", - "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-1.1.3-spruceDev.20240115214739.tgz", - "integrity": "sha512-R3VCHgf5Qu2qbfoGGqTTWdVU6Y1Fo66sQD6lrv001hoaEHSIDn96WrQxuaOzgekCkT5xMZsat2p18gtTvBcigw==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-2.0.0.tgz", + "integrity": "sha512-deDcyJNqqgbww+nt3asqkdsTTASYOdVwfqlRmumKKT+v9760+tupRoUVPWqZ1+3LoVyvC+03DfhulQQHz6rC3Q==", "dependencies": { "@sinclair/typebox": "^0.29.0", "ajv": "^8.12.0", diff --git a/package.json b/package.json index a4dc39c..673bdc7 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,7 @@ "format:check": "npx prettier --check src" }, "dependencies": { - "@pinecone-database/pinecone": "^1.1.3-spruceDev.20240115214739", + "@pinecone-database/pinecone": "^2.0.0", "@xenova/transformers": "2.0.1", "cli-progress": "^3.12.0", "dotenv": "^16.0.3",