diff --git a/.github/workflows/ci_code.yml b/.github/workflows/ci_code.yml index 952c915d4..078a306c3 100644 --- a/.github/workflows/ci_code.yml +++ b/.github/workflows/ci_code.yml @@ -65,6 +65,7 @@ jobs: # TODO: We currently need a default plugin to run tests using MongoDB. # Once the local file database is complete, we may need to update this section. python -m pip install plugins/mongodb + python -m pip install plugins/openai python -m pip install plugins/ibis python -m pip install plugins/sqlalchemy @@ -85,5 +86,6 @@ jobs: - name: Usecase Testing run: | + cp -r templates/* superduper/templates/ make usecase_testing SUPERDUPER_CONFIG=test/configs/default.yaml make usecase_testing SUPERDUPER_CONFIG=test/configs/sql.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index a1f401f95..b754954b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 **Before you create a Pull Request, remember to update the Changelog with your changes.** -## Changes Since Last Release +## Changes Since Last Release #### Changed defaults / behaviours diff --git a/pyproject.toml b/pyproject.toml index d1e41407f..2047fb5c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -139,6 +139,7 @@ ignore = [ "D102", "E402", ] +exclude = ["templates", "superduper/templates"] [tool.ruff.lint.isort] combine-as-imports = true diff --git a/superduper/base/document.py b/superduper/base/document.py index 42e765961..0d1ae460b 100644 --- a/superduper/base/document.py +++ b/superduper/base/document.py @@ -191,8 +191,9 @@ def decode( :param db: The datalayer to use. """ if '_variables' in r: + variables = {**r['_variables'], 'output_prefix': CFG.output_prefix} r = _replace_variables( - {k: v for k, v in r.items() if k != '_variables'}, **r['_variables'] + {k: v for k, v in r.items() if k != '_variables'}, **variables ) schema = schema or r.get(KEY_SCHEMA) schema = get_schema(db, schema) @@ -216,6 +217,7 @@ def decode( if not isinstance(getters, _Getters): getters = _Getters(getters) + assert isinstance(getters, _Getters) # Prioritize using the local artifact storage getter, # and then use the DB read getter. diff --git a/superduper/cli/main.py b/superduper/cli/main.py index de133aa8f..f8ac3a1b8 100644 --- a/superduper/cli/main.py +++ b/superduper/cli/main.py @@ -176,6 +176,7 @@ def _apply(name: str, variables: str | None = None, data_backend: str | None = N variables = variables or '{}' variables = json.loads(variables) + # TODO remove all of this template logic def _build_from_template(t): assert variables is not None, 'Variables must be provided for templates' all_values = variables.copy() diff --git a/superduper/components/template.py b/superduper/components/template.py index c682ccd2c..4d647dae7 100644 --- a/superduper/components/template.py +++ b/superduper/components/template.py @@ -80,7 +80,6 @@ def default_values(self): def form_template(self): """Form to be diplayed to user.""" return { - 'identifier': '', '_variables': { k: ( f'' diff --git a/superduper/rest/build.py b/superduper/rest/build.py index 9da37d44b..a051855e8 100644 --- a/superduper/rest/build.py +++ b/superduper/rest/build.py @@ -18,7 +18,6 @@ from superduper import logging from superduper.backends.base.query import Query from superduper.base.document import Document -from superduper.components.component import Component from superduper.components.template import Template from superduper.rest.base import DatalayerDependency, SuperDuperApp @@ -169,31 +168,6 @@ def _process_db_apply(db, component, id: str | None = None): else: db.apply(component, force=True) - def _process_apply_info(db, info): - if '_variables' in info: - assert {'_variables', 'identifier'}.issubset(info.keys()) - variables = info.pop('_variables') - for k in variables: - if isinstance(variables[k], str): - assert '<' not in variables[k] - assert '>' not in variables[k] - - identifier = info.pop('identifier') - template_name = info.pop('_template_name', None) - - component = Component.from_template( - identifier=identifier, - template_body=info, - template_name=template_name, - db=db, - **variables, - ) - return component - component = Document.decode(info, db=db).unpack() - # TODO this shouldn't be necessary to do twice - component.unpack() - return component - @app.add('/db/apply', method='post') async def db_apply( info: t.Dict, @@ -201,7 +175,7 @@ async def db_apply( id: str | None = 'test', db: 'Datalayer' = DatalayerDependency(), ): - component = _process_apply_info(db, info) + component = Document.decode(info, db=db).unpack() background_tasks.add_task(_process_db_apply, db, component, id) return {'status': 'ok'} diff --git a/templates/pdf_rag/streamlit.py b/templates/pdf_rag/streamlit.py index ae723c4a5..6d1a36be1 100644 --- a/templates/pdf_rag/streamlit.py +++ b/templates/pdf_rag/streamlit.py @@ -42,10 +42,7 @@ def init_db(): def load_questions(): - return [ - "What is sparse-vector retrieval?", - "How to perform Query Optimization?" - ] + return ["What is sparse-vector retrieval?", "How to perform Query Optimization?"] db, model_rag = st.cache_resource(init_db)() diff --git a/test/integration/usecase/test_build_interface.py b/test/integration/usecase/test_build_interface.py new file mode 100644 index 000000000..098d47088 --- /dev/null +++ b/test/integration/usecase/test_build_interface.py @@ -0,0 +1,22 @@ +import json + +import pytest + +from superduper import Application, Document + + +@pytest.mark.skip +def test_build_from_template(db): + from superduper import templates + + db.apply(templates.simple_rag) + + with open('test/material/sample_app/component.json') as f: + component = json.load(f) + + component = templates.simple_rag.form_template + component['_variables']['output_prefix'] = '_output__' + + c = Document.decode(component, db=db).unpack() + + assert isinstance(c, Application) diff --git a/test/material/sample_app/component.json b/test/material/sample_app/component.json new file mode 100644 index 000000000..3a39355e7 --- /dev/null +++ b/test/material/sample_app/component.json @@ -0,0 +1,246 @@ +{ + "_variables": { + "table_name": "sample_simple_rag", + "id_field": "_id", + "databackend": "mongodb", + "base_url": null, + "api_key": null, + "embedding_model": "text-embedding-ada-002", + "llm_model": "gpt-3.5-turbo" + }, + "types": { + "id_field": { + "type": "str", + "default": "_id" + }, + "embedding_model": { + "type": "str", + "default": "text-embedding-ada-002" + }, + "llm_model": { + "type": "str", + "default": "gpt-3.5-turbo" + }, + "table_name": { + "type": "str", + "default": "sample_simple_rag" + }, + "databackend": { + "type": "str", + "default": "mongodb" + }, + "base_url": { + "type": "str", + "optional": true, + "default": null + }, + "api_key": { + "type": "str", + "optional": true, + "default": null + } + }, + "_base": "?simple-rag-app", + "_builds": { + "datatype:dill": { + "_path": "superduper.components.datatype.get_serializer", + "method": "dill", + "encodable": "artifact" + }, + "727d3bb560939e1211f9cac189d56e07e9622eeb": { + "_path": "superduper.components.datatype.Artifact", + "datatype": "?datatype:dill", + "uri": null, + "blob": "&:blob:727d3bb560939e1211f9cac189d56e07e9622eeb" + }, + "model:chunker": { + "_object": "?727d3bb560939e1211f9cac189d56e07e9622eeb", + "upstream": null, + "plugins": null, + "cache": true, + "status": null, + "signature": "singleton", + "datatype": null, + "output_schema": null, + "model_update_kwargs": {}, + "predict_kwargs": {}, + "compute_kwargs": {}, + "validation": null, + "metric_values": {}, + "num_workers": 0, + "serve": false, + "trainer": null, + "deploy": false, + "chunk_size": 200 + }, + "var-table-name-select-var-id-field-x": { + "_path": "superduper_.query.parse_query", + "documents": [], + "query": ".select(\"\", \"x\")" + }, + "listener:chunker": { + "_path": "superduper.components.listener.Listener", + "upstream": null, + "plugins": null, + "cache": true, + "status": null, + "cdc_table": "", + "key": "x", + "model": "?model:chunker", + "predict_kwargs": {}, + "select": "?var-table-name-select-var-id-field-x", + "flatten": true + }, + "datatype:sqlvector[1536]": { + "_path": "superduper.components.vector_index.sqlvector", + "shape": [ + 1536 + ] + }, + "model:": { + "_path": "superduper_openai.model.OpenAIEmbedding", + "upstream": null, + "plugins": null, + "cache": true, + "status": null, + "signature": "singleton", + "datatype": "?datatype:sqlvector[1536]", + "output_schema": null, + "model_update_kwargs": {}, + "predict_kwargs": {}, + "compute_kwargs": {}, + "validation": null, + "metric_values": {}, + "num_workers": 0, + "serve": false, + "trainer": null, + "deploy": false, + "model": "", + "max_batch_size": 8, + "openai_api_key": null, + "openai_api_base": null, + "client_kwargs": { + "base_url": null, + "api_key": null + }, + "shape": [ + 1536 + ], + "batch_size": 100 + }, + "outputs-chunker-?(listener:chunker.uuid)-select-id-source-outputs-chunker-?(listener:chunker.uuid)": { + "_path": "superduper_.query.parse_query", + "documents": [], + "query": "chunker__?(listener:chunker.uuid).select(\"id\", \"_source\", \"chunker__?(listener:chunker.uuid)\")" + }, + "listener:embeddinglistener": { + "_path": "superduper.components.listener.Listener", + "upstream": [ + "?listener:chunker", + "?listener:chunker" + ], + "plugins": null, + "cache": true, + "status": null, + "cdc_table": "chunker__?(listener:chunker.uuid)", + "key": "chunker__?(listener:chunker.uuid)", + "model": "?model:", + "predict_kwargs": {}, + "select": "?outputs-chunker-?(listener:chunker.uuid)-select-id-source-outputs-chunker-?(listener:chunker.uuid)", + "flatten": false + }, + "vector_index:vectorindex": { + "_path": "superduper.components.vector_index.VectorIndex", + "upstream": null, + "plugins": null, + "cache": true, + "status": null, + "cdc_table": "embeddinglistener__?(listener:embeddinglistener.uuid)", + "indexing_listener": "?listener:embeddinglistener", + "compatible_listener": null, + "measure": "cosine", + "metric_values": {} + }, + "outputs-chunker-?(listener:chunker.uuid)-select-like-outputs-chunker-?(listener:chunker.uuid)-var-query-vector-index-vectorindex-n-5": { + "_path": "superduper_.query.parse_query", + "documents": [ + { + "chunker__?(listener:chunker.uuid)": "" + } + ], + "query": "chunker__?(listener:chunker.uuid).select().like(documents[0], vector_index=\"vectorindex\", n=5)" + }, + "model:llm-model": { + "_path": "superduper_openai.model.OpenAIChatCompletion", + "upstream": null, + "plugins": null, + "cache": true, + "status": null, + "signature": "singleton", + "datatype": null, + "output_schema": null, + "model_update_kwargs": {}, + "predict_kwargs": {}, + "compute_kwargs": {}, + "validation": null, + "metric_values": {}, + "num_workers": 0, + "serve": false, + "trainer": null, + "deploy": false, + "model": "", + "max_batch_size": 8, + "openai_api_key": null, + "openai_api_base": null, + "client_kwargs": { + "base_url": null, + "api_key": null + }, + "batch_size": 1, + "prompt": "" + }, + "model:simple_rag": { + "_path": "superduper.components.model.RAGModel", + "upstream": null, + "plugins": null, + "cache": true, + "status": null, + "signature": "singleton", + "datatype": null, + "output_schema": null, + "model_update_kwargs": {}, + "predict_kwargs": {}, + "compute_kwargs": {}, + "validation": null, + "metric_values": {}, + "num_workers": 0, + "serve": false, + "trainer": null, + "deploy": false, + "prompt_template": "Use the following context snippets, these snippets are not ordered!, Answer the question based on this context.\nThese snippets are samples from our internal data-repositories, and should be used exclusively and as a matter of priority to answer the question\n\n{context}\n\nHere's the question: {query}", + "select": "?outputs-chunker-?(listener:chunker.uuid)-select-like-outputs-chunker-?(listener:chunker.uuid)-var-query-vector-index-vectorindex-n-5", + "key": "chunker__?(listener:chunker.uuid)", + "llm": "?model:llm-model" + }, + "simple-rag-app": { + "_path": "superduper.components.application.Application", + "upstream": null, + "plugins": null, + "cache": true, + "status": null, + "components": [ + "?listener:chunker", + "?vector_index:vectorindex", + "?model:simple_rag" + ], + "namespace": null, + "link": null, + "_literals": [ + "template" + ] + } + }, + "_blobs": {}, + "_files": {}, + "_template_name": "simple_rag" +}