Skip to content

Commit

Permalink
updating docs
Browse files Browse the repository at this point in the history
  • Loading branch information
rbyh committed Feb 5, 2025
2 parents 9541d96 + 9db1bc9 commit 5c94407
Show file tree
Hide file tree
Showing 9 changed files with 698 additions and 8 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

## [0.1.43] - TBD
### Added
- Method `ScenarioList.from_pdf_to_image(<filename>)` generates a scenario for each page of a PDF, with the page converted into a JPEG (so it can be used as an image instead of being converted to text).

### Changed
### Fixed
- A bug preventing iterations on remote inference.


## [0.1.42] - 2025-01-24
Expand Down
562 changes: 562 additions & 0 deletions docs/requirements.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion edsl/agents/Invigilator.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def _extract_edsl_result_entry_and_validate(
self.question.question_options = new_question_options

question_with_validators = self.question.render(
self.scenario | prior_answers_dict
self.scenario | prior_answers_dict | {'agent':self.agent.traits}
)
question_with_validators.use_code = self.question.use_code
else:
Expand Down
5 changes: 4 additions & 1 deletion edsl/coop/coop.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ def remote_cache_create_many(
def remote_cache_get(
self,
exclude_keys: Optional[list[str]] = None,
select_keys: Optional[list[str]] = None,
) -> list[CacheEntry]:
"""
Get all remote cache entries.
Expand All @@ -560,10 +561,12 @@ def remote_cache_get(
"""
if exclude_keys is None:
exclude_keys = []
if select_keys is None:
select_keys = []
response = self._send_server_request(
uri="api/v0/remote-cache/get-many",
method="POST",
payload={"keys": exclude_keys},
payload={"keys": exclude_keys, "selected_keys": select_keys},
timeout=40,
)
self._resolve_server_response(response)
Expand Down
1 change: 1 addition & 0 deletions edsl/questions/question_base_gen_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def render_string(value: str) -> str:
.render(strings_only_replacement_dict)
)
except Exception as e:
#breakpoint()
import warnings

warnings.warn("Failed to render string: " + value)
Expand Down
33 changes: 33 additions & 0 deletions edsl/scenarios/Scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,39 @@ def from_pdf(cls, pdf_path: str):
extractor = PdfExtractor(pdf_path)
return Scenario(extractor.get_pdf_dict())

@classmethod
def from_pdf_to_image(cls, pdf_path: str, image_format: str = "jpeg") -> "Scenario":
    """Create one scenario holding every page of a PDF as an image.

    Each page of the PDF is rendered to an image, saved into a temporary
    directory, and wrapped in a ``FileStore`` stored under its own key.

    :param pdf_path: Path to the PDF file.
    :param image_format: Format of the saved page images (default is 'jpeg').
        It is used both as the file extension and, upper-cased, as the
        format name passed to ``image.save``.
    :return: A single scenario (an instance of ``cls``) with the key
        "filepath" (the PDF path as given) and one key "page_{i}" per page,
        numbered from 0.
    """
    import os
    import tempfile

    from pdf2image import convert_from_path
    from edsl import FileStore

    scenario_dict = {"filepath": pdf_path}

    with tempfile.TemporaryDirectory() as output_folder:
        # Convert the PDF to one image object per page.
        for i, image in enumerate(convert_from_path(pdf_path)):
            image_path = os.path.join(output_folder, f"page_{i}.{image_format}")
            image.save(image_path, image_format.upper())

            # The FileStore is built while the temporary file still exists.
            # NOTE(review): presumably FileStore reads the file contents at
            # construction time, since the directory is removed on exit - confirm.
            scenario_dict[f"page_{i}"] = FileStore(image_path)

    return cls(scenario_dict)

@classmethod
def from_docx(cls, docx_path: str) -> "Scenario":
"""Creates a scenario from the text of a docx file.
Expand Down
25 changes: 22 additions & 3 deletions edsl/scenarios/ScenarioList.py
Original file line number Diff line number Diff line change
Expand Up @@ -1135,7 +1135,7 @@ def from_excel(
return cls(observations)

@classmethod
def from_google_sheet(cls, url: str, sheet_name: str = None) -> ScenarioList:
def from_google_sheet(cls, url: str, sheet_name: str = None, column_names: Optional[List[str]]= None) -> ScenarioList:
"""Create a ScenarioList from a Google Sheet.
This method downloads the Google Sheet as an Excel file, saves it to a temporary file,
Expand All @@ -1145,6 +1145,8 @@ def from_google_sheet(cls, url: str, sheet_name: str = None) -> ScenarioList:
url (str): The URL to the Google Sheet.
sheet_name (str, optional): The name of the sheet to load. If None, the method will behave
the same as from_excel regarding multiple sheets.
column_names (List[str], optional): If provided, use these names for the columns instead
of the default column names from the sheet.
Returns:
ScenarioList: An instance of the ScenarioList class.
Expand Down Expand Up @@ -1172,8 +1174,25 @@ def from_google_sheet(cls, url: str, sheet_name: str = None) -> ScenarioList:
temp_file.write(response.content)
temp_filename = temp_file.name

# Call the from_excel class method with the temporary file
return cls.from_excel(temp_filename, sheet_name=sheet_name)
# First create the ScenarioList with default column names
scenario_list = cls.from_excel(temp_filename, sheet_name=sheet_name)

# If column_names is provided, create a new ScenarioList with the specified names
if column_names is not None:
if len(column_names) != len(scenario_list[0].keys()):
raise ValueError(
f"Number of provided column names ({len(column_names)}) "
f"does not match number of columns in sheet ({len(scenario_list[0].keys())})"
)

# Create a codebook mapping original keys to new names
original_keys = list(scenario_list[0].keys())
codebook = dict(zip(original_keys, column_names))

# Return new ScenarioList with renamed columns
return scenario_list.rename(codebook)
else:
return scenario_list

@classmethod
def from_delimited_file(
Expand Down
4 changes: 1 addition & 3 deletions edsl/scenarios/ScenarioListPdfMixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def is_url(string):
return False

@classmethod
def _from_pdf_to_image(cls, pdf_path, image_format="jpeg"):
def from_pdf_to_image(cls, pdf_path, image_format="jpeg"):
"""
Convert each page of a PDF into an image and create Scenario instances.
Expand All @@ -173,7 +173,6 @@ def _from_pdf_to_image(cls, pdf_path, image_format="jpeg"):
image_path = os.path.join(output_folder, f"page_{i+1}.{image_format}")
image.save(image_path, image_format.upper())

# scenario = Scenario._from_filepath_image(image_path)
from edsl import FileStore
scenario = Scenario({
"filepath":image_path,
Expand All @@ -182,7 +181,6 @@ def _from_pdf_to_image(cls, pdf_path, image_format="jpeg"):
})
scenarios.append(scenario)

# print(f"Saved {len(images)} pages as images in {output_folder}")
return cls(scenarios)

@staticmethod
Expand Down
71 changes: 71 additions & 0 deletions test_question_base_gen_mixin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import pytest
from unittest.mock import patch
from edsl import QuestionFreeText, QuestionMultipleChoice, ScenarioList, Agent, Model


class TestQuestionBaseGenMixin:
    """Tests for the copy/permute/draw/loop/render/apply helpers on questions."""

    def test_copy(self):
        """Test that the copy method returns a new, equivalent question."""
        q = QuestionFreeText(
            question_name="color", question_text="What is your favorite color?"
        )
        q_copy = q.copy()

        assert q_copy is not q  # Ensure it's a new object
        assert q_copy.question_name == q.question_name
        assert q_copy.question_text == q.question_text

    def test_option_permutations(self):
        """Test that option_permutations generates every ordering of the options."""
        q = QuestionMultipleChoice(
            question_name="fruit",
            question_text="Pick a fruit",
            question_options=["Apple", "Banana", "Cherry"],
        )
        permutations = q.option_permutations()

        assert len(permutations) == 6  # 3! = 6 permutations
        assert all(
            len(p.question_options) == len(q.question_options) for p in permutations
        )

    def test_draw(self):
        """Test that draw returns a new question with shuffled options."""
        q = QuestionMultipleChoice(
            question_name="drink",
            question_text="Pick a drink",
            question_options=["Tea", "Coffee", "Juice"],
        )

        # Pin the shuffle so the expected order is deterministic.
        with patch("random.sample", return_value=["Juice", "Tea", "Coffee"]):
            drawn = q.draw()

        assert drawn is not q
        assert set(drawn.question_options) == set(q.question_options)
        assert drawn.question_options == ["Juice", "Tea", "Coffee"]

    def test_loop(self):
        """Test that loop creates correctly named questions based on scenarios."""
        q = QuestionFreeText(
            question_name="base_{{subject}}",
            question_text="What are your thoughts on: {{subject}}?",
        )
        scenarios = ScenarioList.from_list(
            "subject", ["Math", "Economics", "Chemistry"]
        )
        looped_questions = q.loop(scenarios)

        assert len(looped_questions) == 3
        assert looped_questions[0].question_name == "base_Math"
        assert looped_questions[1].question_name == "base_Economics"
        assert looped_questions[2].question_name == "base_Chemistry"

    def test_render(self):
        """Test that render correctly replaces agent traits in the question text."""
        m = Model("test")
        a = Agent(traits={"hair_color": "red"})
        q = QuestionFreeText(
            question_name="test",
            question_text="How do you say '{{ agent.hair_color }}' in German?",
        )
        # The template looks up `agent` and then `hair_color` on it, so the
        # replacement dict must be nested under "agent" (a flat
        # "agent.hair_color" key is never consulted).
        rendered_q = q.render({"agent": a.traits})

        assert rendered_q.question_text == "How do you say 'red' in German?"

        # Run with the agent and model built above rather than whatever the
        # default model is.
        rendered_q.by(a).by(m).run(
            disable_remote_inference=True, stop_on_exception=True
        )

    def test_apply_function(self):
        """Test that apply_function transforms question fields correctly."""
        q = QuestionFreeText(
            question_name="color", question_text="What is your favorite color?"
        )

        def upper_case_func(x):
            return x.upper()

        transformed_q = q.apply_function(upper_case_func)

        assert transformed_q.question_text == "WHAT IS YOUR FAVORITE COLOR?"
        assert transformed_q.question_name == "color"  # Should remain unchanged


if __name__ == "__main__":
    # This module contains pytest-style tests and no doctests, so running it
    # directly hands the file to pytest and exits with pytest's status code.
    import pytest

    raise SystemExit(pytest.main([__file__]))

0 comments on commit 5c94407

Please sign in to comment.