Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code update for https://github.com/mitdbg/palimpzest/issues/84 #101

Merged
merged 4 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ class Email(Schema):

# Lazy construction of computation to filter for emails about holidays sent in July
dataset = dataset.convert(Email, desc="An email from the Enron dataset")
dataset = dataset.filter("The email was sent in July")
dataset = dataset.filter("The email is about holidays")
dataset = dataset.sem_filter("The email was sent in July")
dataset = dataset.sem_filter("The email is about holidays")

# Executing the computation
policy = MinCost()
Expand Down
2 changes: 1 addition & 1 deletion demos/askem-var.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class Variable(Schema):
excerpts = Dataset(df_input, schema=Papersnippet)
output = excerpts.convert(
Variable, desc="A variable used or introduced in the context", cardinality=Cardinality.ONE_TO_MANY
).filter("The value name is 'a'", depends_on="name")
).sem_filter("The value name is 'a'", depends_on="name")
policy = MaxQuality()
config = QueryProcessorConfig(
policy=policy,
Expand Down
6 changes: 3 additions & 3 deletions demos/bdf-suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def extract_supplemental(processing_strategy, execution_strategy, optimizer_stra
def integrate_tables(processing_strategy, execution_strategy, optimizer_strategy, policy):
xls = Dataset("biofabric-tiny", schema=XLSFile)
patient_tables = xls.convert(Table, udf=udfs.xls_to_tables, cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The table contains biometric information about the patient")
patient_tables = patient_tables.sem_filter("The table contains biometric information about the patient")
case_data = patient_tables.convert(
CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY
)
Expand Down Expand Up @@ -162,7 +162,7 @@ def integrate_tables(processing_strategy, execution_strategy, optimizer_strategy
@st.cache_resource()
def extract_references(processing_strategy, execution_strategy, optimizer_strategy, policy):
papers = Dataset("bdf-usecase3-tiny", schema=ScientificPaper)
papers = papers.filter("The paper mentions phosphorylation of Exo1")
papers = papers.sem_filter("The paper mentions phosphorylation of Exo1")
references = papers.convert(
Reference, desc="A paper cited in the reference section", cardinality=Cardinality.ONE_TO_MANY
)
Expand Down Expand Up @@ -204,7 +204,7 @@ def extract_references(processing_strategy, execution_strategy, optimizer_strate
if run_pz:
# reference, plan, stats = run_workload()
papers = Dataset(dataset, schema=ScientificPaper)
papers = papers.filter("The paper mentions phosphorylation of Exo1")
papers = papers.sem_filter("The paper mentions phosphorylation of Exo1")
output = papers.convert(Reference, desc="The references cited in the paper", cardinality=Cardinality.ONE_TO_MANY)

# output = references
Expand Down
4 changes: 2 additions & 2 deletions demos/bdf-usecase3.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class Reference(Schema):
@st.cache_resource()
def run_workload():
papers = Dataset("bdf-usecase3-tiny", schema=ScientificPaper)
# papers = papers.filter("The paper mentions phosphorylation of Exo1")
# papers = papers.sem_filter("The paper mentions phosphorylation of Exo1")
references = papers.convert(
Reference, desc="A paper cited in the reference section", cardinality=Cardinality.ONE_TO_MANY
)
Expand Down Expand Up @@ -104,7 +104,7 @@ def run_workload():
if run_pz:
# reference, plan, stats = run_workload()
papers = Dataset(dataset, schema=ScientificPaper)
papers = papers.filter("The paper mentions phosphorylation of Exo1")
papers = papers.sem_filter("The paper mentions phosphorylation of Exo1")
output = papers.convert(Reference, desc="The references cited in the paper", cardinality=Cardinality.ONE_TO_MANY)

# output = references
Expand Down
6 changes: 3 additions & 3 deletions demos/biofabric-demo-matching.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@
"\n",
"xls = Dataset('biofabric-tiny', schema=XLSFile)\n",
"patient_tables = xls.convert(Table, desc=\"All tables in the file\", cardinality=Cardinality.ONE_TO_MANY)\n",
"patient_tables = patient_tables.filter(\"The table contains biometric information about the patient\")\n",
"patient_tables = patient_tables.sem_filter(\"The table contains biometric information about the patient\")\n",
"\n",
"output = patient_tables\n",
"\n",
Expand Down Expand Up @@ -1113,7 +1113,7 @@
"\n",
"xls = Dataset('biofabric-tiny', schema=XLSFile)\n",
"patient_tables = xls.convert(Table, desc=\"All tables in the file\", cardinality=Cardinality.ONE_TO_MANY)\n",
"patient_tables = patient_tables.filter(\"The table contains biometric information about the patient\")\n",
"patient_tables = patient_tables.sem_filter(\"The table contains biometric information about the patient\")\n",
"case_data = patient_tables.convert(CaseData, desc=\"The patient data in the table\", cardinality=Cardinality.ONE_TO_MANY)\n",
"\n",
"policy = MinCost()\n",
Expand Down Expand Up @@ -1604,7 +1604,7 @@
"\n",
"xls = Dataset('biofabric-tiny', schema=XLSFile)\n",
"patient_tables = xls.convert(Table, desc=\"All tables in the file\", cardinality=Cardinality.ONE_TO_MANY)\n",
"patient_tables = patient_tables.filter(\"The table contains biometric information about the patient\")\n",
"patient_tables = patient_tables.sem_filter(\"The table contains biometric information about the patient\")\n",
"case_data = patient_tables.convert(CaseData, desc=\"The patient data in the table\",cardinality=Cardinality.ONE_TO_MANY)\n",
"\n",
"policy = MaxQuality()\n",
Expand Down
12 changes: 6 additions & 6 deletions demos/biofabric-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,11 @@ def print_table(output):
elif experiment == "filtering":
xls = Dataset("biofabric-tiny", schema=XLSFile)
patient_tables = xls.convert(Table, udf=udfs.xls_to_tables, cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The rows of the table contain the patient age")
# patient_tables = patient_tables.filter("The table explains the meaning of attributes")
# patient_tables = patient_tables.filter("The table contains patient biometric data")
# patient_tables = patient_tables.filter("The table contains proteomic data")
# patient_tables = patient_tables.filter("The table records if the patient is excluded from the study")
patient_tables = patient_tables.sem_filter("The rows of the table contain the patient age")
# patient_tables = patient_tables.sem_filter("The table explains the meaning of attributes")
# patient_tables = patient_tables.sem_filter("The table contains patient biometric data")
# patient_tables = patient_tables.sem_filter("The table contains proteomic data")
# patient_tables = patient_tables.sem_filter("The table records if the patient is excluded from the study")
output = patient_tables

elif experiment == "matching":
Expand All @@ -122,7 +122,7 @@ def print_table(output):
elif experiment == "endtoend":
xls = Dataset("biofabric-tiny", schema=XLSFile)
patient_tables = xls.convert(Table, udf=udfs.xls_to_tables, cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The rows of the table contain the patient age")
patient_tables = patient_tables.sem_filter("The rows of the table contain the patient age")
case_data = patient_tables.convert(
CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY
)
Expand Down
8 changes: 4 additions & 4 deletions demos/demo_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ def build_test_pdf_plan(dataset_id):
def build_mit_battery_paper_plan(dataset_id):
"""A dataset-independent declarative description of authors of good papers"""
sci_papers = Dataset(dataset_id, schema=ScientificPaper)
battery_papers = sci_papers.filter("The paper is about batteries")
mit_papers = battery_papers.filter("The paper is from MIT")
battery_papers = sci_papers.sem_filter("The paper is about batteries")
mit_papers = battery_papers.sem_filter("The paper is from MIT")
return mit_papers

def build_enron_plan(dataset_id):
Expand Down Expand Up @@ -111,15 +111,15 @@ def build_image_plan(dataset_id):
"""Build a plan for processing dog images"""
from palimpzest.sets import Dataset
images = Dataset(dataset_id, schema=ImageFile)
filtered_images = images.filter("The image contains one or more dogs")
filtered_images = images.sem_filter("The image contains one or more dogs")
dog_images = filtered_images.convert(DogImage, desc="Images of dogs")
return dog_images

def build_image_agg_plan(dataset_id):
"""Build a plan for aggregating dog images by breed"""
from palimpzest.sets import Dataset
images = Dataset(dataset_id, schema=ImageFile)
filtered_images = images.filter("The image contains one or more dogs")
filtered_images = images.sem_filter("The image contains one or more dogs")
dog_images = filtered_images.convert(DogImage, desc="Images of dogs")
ops = ["count"]
fields = ["breed"]
Expand Down
18 changes: 7 additions & 11 deletions demos/df-newinterface.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
import pandas as pd

import palimpzest as pz
from palimpzest.query.processor.config import QueryProcessorConfig

df = pd.read_csv("testdata/enron-tiny.csv")
qr2 = pz.Dataset(df)
qr2 = qr2.add_columns({"sender": ("The email address of the sender", "string"),
"subject": ("The subject of the email", "string"),#
"date": ("The date the email was sent", "string")})
qr3 = qr2.filter("It is an email").filter("It has Vacation in the subject")
qr2 = qr2.sem_add_columns([
{"name" : "sender", "desc" : "The email address of the sender", "type" : "string"},
{"name" : "subject", "desc" : "The subject of the email", "type" : "string"},
{"name" : "date", "desc" : "The date the email was sent", "type" : "string"}
])

config = QueryProcessorConfig(
verbose=True,
execution_strategy="pipelined_parallel",
)

output = qr3.run(config)
qr3 = qr2.sem_filter("It is an email").sem_filter("It has Vacation in the subject")
output = qr3.run()
output_df = output.to_df()
print(output_df)

Expand Down
2 changes: 1 addition & 1 deletion demos/image-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class DogImage(ImageFile):

def build_image_plan(dataset_id):
images = Dataset(dataset_id, schema=ImageFile)
filtered_images = images.filter("The image contains one or more dogs")
filtered_images = images.sem_filter("The image contains one or more dogs")
dog_images = filtered_images.convert(DogImage, desc="Images of dogs")
return dog_images

Expand Down
6 changes: 3 additions & 3 deletions demos/optimizer-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,10 +882,10 @@ def get_item(self, idx: int, val: bool = False, include_label: bool = False):
DataDirectory().register_user_source(src=datasource, dataset_id=user_dataset_id)

plan = Dataset(user_dataset_id, schema=Email)
plan = plan.filter(
plan = plan.sem_filter(
"The email is not quoting from a news article or an article written by someone outside of Enron"
)
plan = plan.filter(
plan = plan.sem_filter(
'The email refers to a fraudulent scheme (i.e., "Raptor", "Deathstar", "Chewco", and/or "Fat Boy")'
)

Expand All @@ -910,7 +910,7 @@ def get_item(self, idx: int, val: bool = False, include_label: bool = False):
plan = Dataset(user_dataset_id, schema=RealEstateListingFiles)
plan = plan.convert(TextRealEstateListing, depends_on="text_content")
plan = plan.convert(ImageRealEstateListing, depends_on="image_filepaths")
plan = plan.filter(
plan = plan.sem_filter(
"The interior is modern and attractive, and has lots of natural sunlight",
depends_on=["is_modern_and_attractive", "has_natural_sunlight"],
)
Expand Down
8 changes: 4 additions & 4 deletions demos/paper-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,10 @@ def get_item(self, idx: int):
if workload == "enron":
# datasetid="enron-eval" for paper evaluation
plan = Dataset(datasetid, schema=Email)
plan = plan.filter(
plan = plan.sem_filter(
"The email is not quoting from a news article or an article written by someone outside of Enron"
)
plan = plan.filter(
plan = plan.sem_filter(
'The email refers to a fraudulent scheme (i.e., "Raptor", "Deathstar", "Chewco", and/or "Fat Boy")'
)

Expand All @@ -231,7 +231,7 @@ def get_item(self, idx: int):
plan = Dataset(user_dataset_id, schema=RealEstateListingFiles)
plan = plan.convert(TextRealEstateListing, depends_on="text_content")
plan = plan.convert(ImageRealEstateListing, depends_on="image_filepaths")
plan = plan.filter(
plan = plan.sem_filter(
"The interior is modern and attractive, and has lots of natural sunlight",
depends_on=["is_modern_and_attractive", "has_natural_sunlight"],
)
Expand All @@ -242,7 +242,7 @@ def get_item(self, idx: int):
# datasetid="biofabric-medium" for paper evaluation
plan = Dataset(datasetid, schema=XLSFile)
plan = plan.convert(Table, udf=xls_to_tables, cardinality=Cardinality.ONE_TO_MANY)
plan = plan.filter("The rows of the table contain the patient age")
plan = plan.sem_filter("The rows of the table contain the patient age")
plan = plan.convert(CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY)

config = QueryProcessorConfig(
Expand Down
8 changes: 8 additions & 0 deletions docs/source/chat.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
PalimpChat
==========

PalimpChat is an easy way to get started building Palimpzest pipelines through a chat interface using reasoning agents.
To access our demo please go to this webpage `Palimpchat <http://3.213.4.62:8888/>`_.

You can view a video `here <https://people.csail.mit.edu/chunwei/demo/palimpchat.mp4>`_.

2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
project = "Palimpzest"
copyright = "2025, MIT Data Systems Group"
author = "MIT Data Systems Group"
release = "0.5.2"
release = "0.5.3"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
4 changes: 2 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ For eager readers, the code in the notebook can be found in the following conden

# Lazy construction of computation to filter for emails about holidays sent in July
dataset = dataset.convert(Email, desc="An email from the Enron dataset")
dataset = dataset.filter("The email was sent in July")
dataset = dataset.filter("The email is about holidays")
dataset = dataset.sem_filter("The email was sent in July")
dataset = dataset.sem_filter("The email is about holidays")

# Executing the computation
policy = MinCost()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "palimpzest"
version = "0.5.2" # if you update this, be sure to update package version in `docs/source/conf.py` as well
version = "0.5.3" # if you update this, be sure to update package version in `docs/source/conf.py` as well
description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
readme = "README.md"
requires-python = ">=3.8"
Expand Down
6 changes: 3 additions & 3 deletions quickstart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
"### Step 3: Apply a Filter to the Emails\n",
"Now that we have the emails in the dataset, we can filter them to only retain the ones that mention a vacation plan and were sent in the month of July.\n",
"\n",
"To do this, we will use the `filter` function. This function takes a string which describes in natural language which condition we want the records to satisfy to pass the filter.\n",
"To do this, we will use the `sem_filter` function. This function takes a string which describes in natural language which condition we want the records to satisfy to pass the filter.\n",
"\n",
"When using natural language, you don't need to worry about implementing the filter itself, but the computation will be performed by LLM models. Such is the power of Palimpzest! "
]
Expand All @@ -168,8 +168,8 @@
"metadata": {},
"outputs": [],
"source": [
"dataset = dataset.filter(\"The email was sent in July\")\n",
"dataset = dataset.filter(\"The email is about holidays\")"
"dataset = dataset.sem_filter(\"The email was sent in July\")\n",
"dataset = dataset.sem_filter(\"The email is about holidays\")"
]
},
{
Expand Down
20 changes: 14 additions & 6 deletions src/palimpzest/core/lib/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
NumericField,
StringField,
)
from palimpzest.utils.field_helpers import construct_field_from_python_type
from palimpzest.utils.hash_helpers import hash_for_temp_schema


Expand Down Expand Up @@ -276,15 +277,21 @@ def from_df(df: pd.DataFrame) -> Schema:
return new_schema

@classmethod
def add_fields(cls, fields: dict[str, str]) -> Schema:
def add_fields(cls, fields: list[dict]) -> Schema:
"""Add fields to the schema

Args:
fields: Dictionary mapping field names to their descriptions
fields: List of dictionaries, each containing 'name', 'desc', and 'type' keys

Returns:
A new Schema with the additional fields
"""
assert isinstance(fields, list), "fields must be a list of dictionaries"
for field in fields:
assert "name" in field, "fields must contain a 'name' key"
assert "desc" in field, "fields must contain a 'desc' key"
assert "type" in field, "fields must contain a 'type' key"

# Construct the new schema name
schema_name = cls.class_name()
new_schema_name = f"{schema_name}Extended"
Expand All @@ -297,13 +304,14 @@ def add_fields(cls, fields: dict[str, str]) -> Schema:
new_field_types = list(cls.field_map().values())
new_field_descs = [field._desc for field in new_field_types]

# TODO: Users will provide explicit descriptions for the fields,
# details in https://github.com/mitdbg/palimpzest/issues/84
for field_name, field_desc in fields.items():
# Process new fields from the list of dictionaries
for field in fields:
field_name = field["name"]
field_desc = field["desc"]
if field_name in new_field_names:
continue
new_field_names.append(field_name)
new_field_types.append(StringField(desc=field_desc)) # Assuming StringField for new fields
new_field_types.append(construct_field_from_python_type(field["type"], desc=field_desc))
new_field_descs.append(field_desc)

# Generate the schema class dynamically
Expand Down
Loading