add sentence splitting of file content
Jabbawukis committed Oct 14, 2024
1 parent 6cd9b95 commit bb7915d
Showing 7 changed files with 410 additions and 135 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -11,7 +11,7 @@ help:
 ## run-workflow: Run the workflow pipeline locally for quick evaluation.
 .PHONY: run-workflow
 run-workflow:
-	pip install .
+	pip install .[dev]
 	pytest tests/
 	black src/
 	pylint src/
135 changes: 135 additions & 0 deletions cool_demo.ipynb

Large diffs are not rendered by default.

11 changes: 0 additions & 11 deletions cool_demo.py

This file was deleted.

4 changes: 3 additions & 1 deletion pyproject.toml
@@ -12,7 +12,9 @@ dependencies = [
     "transformers>=4.38",
     "tqdm",
     "datasets",
-    "Whoosh-Reloaded"
+    "Whoosh-Reloaded",
+    "spacy",
+    "numpy",
 ]
 dynamic = ["version"]

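The spacy dependency backs the sentence-splitting path added to fact_matcher.py below; the numpy addition is not exercised by the hunks shown here. A minimal sketch of what spacy enables, assuming spaCy 3.x (the blank English pipeline ships with the library itself, so no model download is needed):

    from spacy.lang.en import English

    # Blank English pipeline; the "sentencizer" component is rule-based
    # (punctuation-driven) and needs no trained model.
    nlp = English()
    nlp.add_pipe("sentencizer")

    doc = nlp("Whoosh indexes documents. SpaCy splits them into sentences.")
    print([sent.text for sent in doc.sents])
    # ['Whoosh indexes documents.', 'SpaCy splits them into sentences.']
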
48 changes: 41 additions & 7 deletions src/sample_efficiency_evaluation/fact_matcher.py
@@ -7,12 +7,14 @@
 import hashlib

 from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor

 from tqdm import tqdm
-from whoosh.index import create_in, FileIndex
+from whoosh.index import create_in, open_dir, FileIndex
 from whoosh.fields import Schema, TEXT, ID
 from whoosh.writing import SegmentWriter
 from whoosh.qparser import QueryParser, query
+from spacy.lang.en import English

 from utility import utility

@@ -42,6 +44,8 @@ def __init__(self, **kwargs):
         - file_index_dir: Path to the index directory.
             This is the directory where the index will be stored. If not provided, it will be set to "indexdir".
+        - read_existing_index: If True, it will read the existing index. If False, it will create a new index.
         """
         self.bear_data_path = kwargs.get("bear_data_path")

@@ -55,13 +59,20 @@

         self.bear_relation_info_dict: dict = utility.load_json_dict(self.bear_relation_info_path)

-        self.entity_relation_info_dict: dict = self.extract_entity_information(self.bear_facts_path)
+        self.entity_relation_info_dict: dict = self._extract_entity_information(self.bear_facts_path)

-        self.writer, self.indexer = self.initialize_index(index_path)
+        if kwargs.get("read_existing_index", False):
+            self.writer, self.indexer = self._open_existing_index_dir(index_path)
+        else:
+            self.writer, self.indexer = self._initialize_index(index_path)

         self.query_parser = QueryParser("content", schema=self.indexer.schema)

-    def extract_entity_information(self, bear_data_path: str) -> dict:
+        self.nlp_pipeline = English()
+
+        self.nlp_pipeline.add_pipe("sentencizer")
+
+    def _extract_entity_information(self, bear_data_path: str) -> dict:
         """
         Extract entity information from bear data.
         :param bear_data_path: Path to bear data directory
@@ -93,23 +104,32 @@ def index_file(self, file_content: str) -> None:
         doc_hash = str(hashlib.sha256(file_content.encode()).hexdigest())
         self.writer.add_document(title=doc_hash, path=f"/{doc_hash}", content=file_content)

-    def index_dataset(self, file_contents: list[dict], text_key: str = "text") -> None:
+    def index_dataset(
+        self, file_contents: list[dict], text_key: str = "text", split_contents_into_sentences: bool = False
+    ) -> None:
         """
         Index dataset files, the dataset is a list of file contents.
         :param text_key: Key to extract text from file content. Since the dataset is a list of file contents, we need to
             specify the key to extract text from the file content. That would be the case if we pass a huggingface dataset.
         :param file_contents: List of file contents
+        :param split_contents_into_sentences: Apply sentence splitting to the text before indexing.
         :return:
         """
         for file_content in tqdm(file_contents, desc="Indexing dataset"):
-            self.index_file(file_content[text_key])
+            if split_contents_into_sentences:
+                split_doc = self.nlp_pipeline(file_content[text_key])
+                with ThreadPoolExecutor() as executor:
+                    sentences = [sent.text for sent in split_doc.sents]
+                    executor.map(self.index_file, sentences)
+            else:
+                self.index_file(file_content[text_key])
         self.commit_index()

     def commit_index(self) -> None:
         self.writer.commit()

     @staticmethod
-    def initialize_index(index_path) -> tuple[SegmentWriter, FileIndex]:
+    def _initialize_index(index_path) -> tuple[SegmentWriter, FileIndex]:
         """
         Initialize index writer and indexer.
         :param index_path:
@@ -122,6 +142,20 @@ def initialize_index(index_path) -> tuple[SegmentWriter, FileIndex]:
         writer = indexer.writer()
         return writer, indexer

+    @staticmethod
+    def _open_existing_index_dir(index_path) -> tuple[SegmentWriter, FileIndex]:
+        """
+        Open an already existing index directory and return writer and indexer.
+        If the index directory does not exist, it will raise an error.
+        Within the directory, there should be one index file.
+        :param index_path:
+        :return:
+        """
+        indexer = open_dir(index_path)
+        writer = indexer.writer()
+        return writer, indexer
+
     @abstractmethod
     def search_index(self, main_query: str, sub_query: str = "") -> list[dict]:
         """
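
A hedged usage sketch of the two new code paths, sentence-wise indexing and reopening an existing index. BearFactMatcher is a hypothetical concrete subclass (the class above is abstract, leaving search_index to implementers), and the BEAR paths are placeholders:

    # Hypothetical subclass and paths, for illustration only.
    matcher = BearFactMatcher(
        bear_data_path="BEAR",
        file_index_dir="indexdir",
        read_existing_index=False,  # build a fresh index in indexdir
    )

    # With split_contents_into_sentences=True, each document is run through
    # the spaCy sentencizer and every sentence is indexed as its own entry,
    # fanned out over a thread pool.
    matcher.index_dataset(
        [{"text": "Berlin is the capital of Germany. It is also its largest city."}],
        text_key="text",
        split_contents_into_sentences=True,
    )

    # A later run can reopen the same directory instead of re-indexing:
    matcher = BearFactMatcher(
        bear_data_path="BEAR",
        file_index_dir="indexdir",
        read_existing_index=True,
    )

Whole-document indexing remains the default (split_contents_into_sentences=False), so existing callers of index_dataset are unaffected by this commit.
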
115 changes: 0 additions & 115 deletions tests/test_fact_matcher.py

This file was deleted.

