Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code update for https://github.com/mitdbg/palimpzest/issues/84 #101

Merged
merged 4 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ class Email(Schema):

# Lazy construction of computation to filter for emails about holidays sent in July
dataset = dataset.convert(Email, desc="An email from the Enron dataset")
dataset = dataset.filter("The email was sent in July")
dataset = dataset.filter("The email is about holidays")
dataset = dataset.sem_filter("The email was sent in July")
dataset = dataset.sem_filter("The email is about holidays")

# Executing the computation
policy = MinCost()
Expand Down
2 changes: 1 addition & 1 deletion demos/askem-var.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class Variable(Schema):
excerpts = Dataset(df_input, schema=Papersnippet)
output = excerpts.convert(
Variable, desc="A variable used or introduced in the context", cardinality=Cardinality.ONE_TO_MANY
).filter("The value name is 'a'", depends_on="name")
).sem_filter("The value name is 'a'", depends_on="name")
policy = MaxQuality()
config = QueryProcessorConfig(
policy=policy,
Expand Down
6 changes: 3 additions & 3 deletions demos/bdf-suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def extract_supplemental(processing_strategy, execution_strategy, optimizer_stra
def integrate_tables(processing_strategy, execution_strategy, optimizer_strategy, policy):
xls = Dataset("biofabric-tiny", schema=XLSFile)
patient_tables = xls.convert(Table, udf=udfs.xls_to_tables, cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The table contains biometric information about the patient")
patient_tables = patient_tables.sem_filter("The table contains biometric information about the patient")
case_data = patient_tables.convert(
CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY
)
Expand Down Expand Up @@ -162,7 +162,7 @@ def integrate_tables(processing_strategy, execution_strategy, optimizer_strategy
@st.cache_resource()
def extract_references(processing_strategy, execution_strategy, optimizer_strategy, policy):
papers = Dataset("bdf-usecase3-tiny", schema=ScientificPaper)
papers = papers.filter("The paper mentions phosphorylation of Exo1")
papers = papers.sem_filter("The paper mentions phosphorylation of Exo1")
references = papers.convert(
Reference, desc="A paper cited in the reference section", cardinality=Cardinality.ONE_TO_MANY
)
Expand Down Expand Up @@ -204,7 +204,7 @@ def extract_references(processing_strategy, execution_strategy, optimizer_strate
if run_pz:
# reference, plan, stats = run_workload()
papers = Dataset(dataset, schema=ScientificPaper)
papers = papers.filter("The paper mentions phosphorylation of Exo1")
papers = papers.sem_filter("The paper mentions phosphorylation of Exo1")
output = papers.convert(Reference, desc="The references cited in the paper", cardinality=Cardinality.ONE_TO_MANY)

# output = references
Expand Down
4 changes: 2 additions & 2 deletions demos/bdf-usecase3.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class Reference(Schema):
@st.cache_resource()
def run_workload():
papers = Dataset("bdf-usecase3-tiny", schema=ScientificPaper)
# papers = papers.filter("The paper mentions phosphorylation of Exo1")
# papers = papers.sem_filter("The paper mentions phosphorylation of Exo1")
references = papers.convert(
Reference, desc="A paper cited in the reference section", cardinality=Cardinality.ONE_TO_MANY
)
Expand Down Expand Up @@ -104,7 +104,7 @@ def run_workload():
if run_pz:
# reference, plan, stats = run_workload()
papers = Dataset(dataset, schema=ScientificPaper)
papers = papers.filter("The paper mentions phosphorylation of Exo1")
papers = papers.sem_filter("The paper mentions phosphorylation of Exo1")
output = papers.convert(Reference, desc="The references cited in the paper", cardinality=Cardinality.ONE_TO_MANY)

# output = references
Expand Down
6 changes: 3 additions & 3 deletions demos/biofabric-demo-matching.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@
"\n",
"xls = Dataset('biofabric-tiny', schema=XLSFile)\n",
"patient_tables = xls.convert(Table, desc=\"All tables in the file\", cardinality=Cardinality.ONE_TO_MANY)\n",
"patient_tables = patient_tables.filter(\"The table contains biometric information about the patient\")\n",
"patient_tables = patient_tables.sem_filter(\"The table contains biometric information about the patient\")\n",
"\n",
"output = patient_tables\n",
"\n",
Expand Down Expand Up @@ -1113,7 +1113,7 @@
"\n",
"xls = Dataset('biofabric-tiny', schema=XLSFile)\n",
"patient_tables = xls.convert(Table, desc=\"All tables in the file\", cardinality=Cardinality.ONE_TO_MANY)\n",
"patient_tables = patient_tables.filter(\"The table contains biometric information about the patient\")\n",
"patient_tables = patient_tables.sem_filter(\"The table contains biometric information about the patient\")\n",
"case_data = patient_tables.convert(CaseData, desc=\"The patient data in the table\", cardinality=Cardinality.ONE_TO_MANY)\n",
"\n",
"policy = MinCost()\n",
Expand Down Expand Up @@ -1604,7 +1604,7 @@
"\n",
"xls = Dataset('biofabric-tiny', schema=XLSFile)\n",
"patient_tables = xls.convert(Table, desc=\"All tables in the file\", cardinality=Cardinality.ONE_TO_MANY)\n",
"patient_tables = patient_tables.filter(\"The table contains biometric information about the patient\")\n",
"patient_tables = patient_tables.sem_filter(\"The table contains biometric information about the patient\")\n",
"case_data = patient_tables.convert(CaseData, desc=\"The patient data in the table\",cardinality=Cardinality.ONE_TO_MANY)\n",
"\n",
"policy = MaxQuality()\n",
Expand Down
12 changes: 6 additions & 6 deletions demos/biofabric-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,11 @@ def print_table(output):
elif experiment == "filtering":
xls = Dataset("biofabric-tiny", schema=XLSFile)
patient_tables = xls.convert(Table, udf=udfs.xls_to_tables, cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The rows of the table contain the patient age")
# patient_tables = patient_tables.filter("The table explains the meaning of attributes")
# patient_tables = patient_tables.filter("The table contains patient biometric data")
# patient_tables = patient_tables.filter("The table contains proteomic data")
# patient_tables = patient_tables.filter("The table records if the patient is excluded from the study")
patient_tables = patient_tables.sem_filter("The rows of the table contain the patient age")
# patient_tables = patient_tables.sem_filter("The table explains the meaning of attributes")
# patient_tables = patient_tables.sem_filter("The table contains patient biometric data")
# patient_tables = patient_tables.sem_filter("The table contains proteomic data")
# patient_tables = patient_tables.sem_filter("The table records if the patient is excluded from the study")
output = patient_tables

elif experiment == "matching":
Expand All @@ -122,7 +122,7 @@ def print_table(output):
elif experiment == "endtoend":
xls = Dataset("biofabric-tiny", schema=XLSFile)
patient_tables = xls.convert(Table, udf=udfs.xls_to_tables, cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The rows of the table contain the patient age")
patient_tables = patient_tables.sem_filter("The rows of the table contain the patient age")
case_data = patient_tables.convert(
CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY
)
Expand Down
8 changes: 4 additions & 4 deletions demos/demo_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ def build_test_pdf_plan(dataset_id):
def build_mit_battery_paper_plan(dataset_id):
"""A dataset-independent declarative description of authors of good papers"""
sci_papers = Dataset(dataset_id, schema=ScientificPaper)
battery_papers = sci_papers.filter("The paper is about batteries")
mit_papers = battery_papers.filter("The paper is from MIT")
battery_papers = sci_papers.sem_filter("The paper is about batteries")
mit_papers = battery_papers.sem_filter("The paper is from MIT")
return mit_papers

def build_enron_plan(dataset_id):
Expand Down Expand Up @@ -111,15 +111,15 @@ def build_image_plan(dataset_id):
"""Build a plan for processing dog images"""
from palimpzest.sets import Dataset
images = Dataset(dataset_id, schema=ImageFile)
filtered_images = images.filter("The image contains one or more dogs")
filtered_images = images.sem_filter("The image contains one or more dogs")
dog_images = filtered_images.convert(DogImage, desc="Images of dogs")
return dog_images

def build_image_agg_plan(dataset_id):
"""Build a plan for aggregating dog images by breed"""
from palimpzest.sets import Dataset
images = Dataset(dataset_id, schema=ImageFile)
filtered_images = images.filter("The image contains one or more dogs")
filtered_images = images.sem_filter("The image contains one or more dogs")
dog_images = filtered_images.convert(DogImage, desc="Images of dogs")
ops = ["count"]
fields = ["breed"]
Expand Down
18 changes: 7 additions & 11 deletions demos/df-newinterface.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
import pandas as pd

import palimpzest as pz
from palimpzest.query.processor.config import QueryProcessorConfig

df = pd.read_csv("testdata/enron-tiny.csv")
qr2 = pz.Dataset(df)
qr2 = qr2.add_columns({"sender": ("The email address of the sender", "string"),
"subject": ("The subject of the email", "string"),#
"date": ("The date the email was sent", "string")})
qr3 = qr2.filter("It is an email").filter("It has Vacation in the subject")
qr2 = qr2.sem_add_columns([
{"name" : "sender", "desc" : "The email address of the sender", "type" : "string"},
{"name" : "subject", "desc" : "The subject of the email", "type" : "string"},
{"name" : "date", "desc" : "The date the email was sent", "type" : "string"}
])

config = QueryProcessorConfig(
verbose=True,
execution_strategy="pipelined_parallel",
)

output = qr3.run(config)
qr3 = qr2.sem_filter("It is an email").sem_filter("It has Vacation in the subject")
output = qr3.run()
output_df = output.to_df()
print(output_df)

Expand Down
2 changes: 1 addition & 1 deletion demos/image-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class DogImage(ImageFile):

def build_image_plan(dataset_id):
images = Dataset(dataset_id, schema=ImageFile)
filtered_images = images.filter("The image contains one or more dogs")
filtered_images = images.sem_filter("The image contains one or more dogs")
dog_images = filtered_images.convert(DogImage, desc="Images of dogs")
return dog_images

Expand Down
6 changes: 3 additions & 3 deletions demos/optimizer-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,10 +882,10 @@ def get_item(self, idx: int, val: bool = False, include_label: bool = False):
DataDirectory().register_user_source(src=datasource, dataset_id=user_dataset_id)

plan = Dataset(user_dataset_id, schema=Email)
plan = plan.filter(
plan = plan.sem_filter(
"The email is not quoting from a news article or an article written by someone outside of Enron"
)
plan = plan.filter(
plan = plan.sem_filter(
'The email refers to a fraudulent scheme (i.e., "Raptor", "Deathstar", "Chewco", and/or "Fat Boy")'
)

Expand All @@ -910,7 +910,7 @@ def get_item(self, idx: int, val: bool = False, include_label: bool = False):
plan = Dataset(user_dataset_id, schema=RealEstateListingFiles)
plan = plan.convert(TextRealEstateListing, depends_on="text_content")
plan = plan.convert(ImageRealEstateListing, depends_on="image_filepaths")
plan = plan.filter(
plan = plan.sem_filter(
"The interior is modern and attractive, and has lots of natural sunlight",
depends_on=["is_modern_and_attractive", "has_natural_sunlight"],
)
Expand Down
8 changes: 4 additions & 4 deletions demos/paper-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,10 @@ def get_item(self, idx: int):
if workload == "enron":
# datasetid="enron-eval" for paper evaluation
plan = Dataset(datasetid, schema=Email)
plan = plan.filter(
plan = plan.sem_filter(
"The email is not quoting from a news article or an article written by someone outside of Enron"
)
plan = plan.filter(
plan = plan.sem_filter(
'The email refers to a fraudulent scheme (i.e., "Raptor", "Deathstar", "Chewco", and/or "Fat Boy")'
)

Expand All @@ -231,7 +231,7 @@ def get_item(self, idx: int):
plan = Dataset(user_dataset_id, schema=RealEstateListingFiles)
plan = plan.convert(TextRealEstateListing, depends_on="text_content")
plan = plan.convert(ImageRealEstateListing, depends_on="image_filepaths")
plan = plan.filter(
plan = plan.sem_filter(
"The interior is modern and attractive, and has lots of natural sunlight",
depends_on=["is_modern_and_attractive", "has_natural_sunlight"],
)
Expand All @@ -242,7 +242,7 @@ def get_item(self, idx: int):
# datasetid="biofabric-medium" for paper evaluation
plan = Dataset(datasetid, schema=XLSFile)
plan = plan.convert(Table, udf=xls_to_tables, cardinality=Cardinality.ONE_TO_MANY)
plan = plan.filter("The rows of the table contain the patient age")
plan = plan.sem_filter("The rows of the table contain the patient age")
plan = plan.convert(CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY)

config = QueryProcessorConfig(
Expand Down
8 changes: 8 additions & 0 deletions docs/source/chat.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
PalimpChat
==========

PalimpChat is an easy way to get started building Palimpzest pipelines through a chat interface using reasoning agents.
To access our demo please go to this webpage `Palimpchat <http://3.213.4.62:8888/>`_.

You can view a video `here <https://people.csail.mit.edu/chunwei/demo/palimpchat.mp4>`_.

2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
project = "Palimpzest"
copyright = "2025, MIT Data Systems Group"
author = "MIT Data Systems Group"
release = "0.5.2"
release = "0.5.3"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
4 changes: 2 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ For eager readers, the code in the notebook can be found in the following conden

# Lazy construction of computation to filter for emails about holidays sent in July
dataset = dataset.convert(Email, desc="An email from the Enron dataset")
dataset = dataset.filter("The email was sent in July")
dataset = dataset.filter("The email is about holidays")
dataset = dataset.sem_filter("The email was sent in July")
dataset = dataset.sem_filter("The email is about holidays")

# Executing the computation
policy = MinCost()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "palimpzest"
version = "0.5.2" # if you update this, be sure to update package version in `docs/source/conf.py` as well
version = "0.5.3" # if you update this, be sure to update package version in `docs/source/conf.py` as well
description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
readme = "README.md"
requires-python = ">=3.8"
Expand Down
6 changes: 3 additions & 3 deletions quickstart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
"### Step 3: Apply a Filter to the Emails\n",
"Now that we have the emails in the dataset, we can filter them to only retain the ones that mention a vacation plan and were sent in the month of July.\n",
"\n",
"To do this, we will use the `filter` function. This function takes a string which describes in natural language which condition we want the records to satisfy to pass the filter.\n",
"To do this, we will use the `sem_filter` function. This function takes a string which describes in natural language which condition we want the records to satisfy to pass the filter.\n",
"\n",
"When using natural language, you don't need to worry about implementing the filter itself, but the computation will be performed by LLM models. Such is the power of Palimpzest! "
]
Expand All @@ -168,8 +168,8 @@
"metadata": {},
"outputs": [],
"source": [
"dataset = dataset.filter(\"The email was sent in July\")\n",
"dataset = dataset.filter(\"The email is about holidays\")"
"dataset = dataset.sem_filter(\"The email was sent in July\")\n",
"dataset = dataset.sem_filter(\"The email is about holidays\")"
]
},
{
Expand Down
20 changes: 14 additions & 6 deletions src/palimpzest/core/lib/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
NumericField,
StringField,
)
from palimpzest.utils.field_helpers import construct_field_from_python_type
from palimpzest.utils.hash_helpers import hash_for_temp_schema


Expand Down Expand Up @@ -276,15 +277,21 @@ def from_df(df: pd.DataFrame) -> Schema:
return new_schema

@classmethod
def add_fields(cls, fields: dict[str, str]) -> Schema:
def add_fields(cls, fields: list[dict]) -> Schema:
"""Add fields to the schema

Args:
fields: Dictionary mapping field names to their descriptions
fields: List of dictionaries, each containing 'name', 'desc', and 'type' keys

Returns:
A new Schema with the additional fields
"""
assert isinstance(fields, list), "fields must be a list of dictionaries"
for field in fields:
assert "name" in field, "fields must contain a 'name' key"
assert "desc" in field, "fields must contain a 'desc' key"
assert "type" in field, "fields must contain a 'type' key"

# Construct the new schema name
schema_name = cls.class_name()
new_schema_name = f"{schema_name}Extended"
Expand All @@ -297,13 +304,14 @@ def add_fields(cls, fields: dict[str, str]) -> Schema:
new_field_types = list(cls.field_map().values())
new_field_descs = [field._desc for field in new_field_types]

# TODO: Users will provide explicit descriptions for the fields,
# details in https://github.com/mitdbg/palimpzest/issues/84
for field_name, field_desc in fields.items():
# Process new fields from the list of dictionaries
for field in fields:
field_name = field["name"]
field_desc = field["desc"]
if field_name in new_field_names:
continue
new_field_names.append(field_name)
new_field_types.append(StringField(desc=field_desc)) # Assuming StringField for new fields
new_field_types.append(construct_field_from_python_type(field["type"], desc=field_desc))
new_field_descs.append(field_desc)

# Generate the schema class dynamically
Expand Down
Loading