Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose conversion settings in upload #134

Merged
9 commits merged on Sep 29, 2023 (source and target branch names were not captured in this extract)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions deepsearch/cps/cli/cli_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@
containing coordinates of COS.""",
)

# CLI option `--conv-settings` (default: None, i.e. the flag is optional).
# Carries the conversion settings to apply when a local file is uploaded.
# NOTE(review): the upload command elsewhere in this diff feeds the raw value
# to json.loads and then ConversionSettings.parse_obj, so the flag presumably
# has to be a JSON-encoded string — confirm, and consider saying so in `help`.
CONV_SETTINGS = typer.Option(
None,
"--conv-settings",
help="""Provide conversion settings to be used on local file upload""",
)

SOURCE_PATH = typer.Option(
None,
"--input-file",
Expand Down
17 changes: 17 additions & 0 deletions deepsearch/cps/cli/data_indices_typer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from deepsearch.cps.cli.cli_options import (
ATTACHMENT_KEY,
ATTACHMENT_PATH,
CONV_SETTINGS,
COORDINATES_PATH,
INDEX_ITEM_ID,
INDEX_KEY,
Expand All @@ -23,6 +24,7 @@
from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource
from deepsearch.cps.data_indices import utils
from deepsearch.documents.core.common_routines import ERROR_MSG
from deepsearch.documents.core.models import ConversionSettings

app = typer.Typer(no_args_is_help=True)

Expand Down Expand Up @@ -135,6 +137,7 @@ def upload_files(
local_file: Path = SOURCE_PATH,
index_key: str = INDEX_KEY,
s3_coordinates: Path = COORDINATES_PATH,
conv_settings: Optional[str] = CONV_SETTINGS,
):
"""
Upload pdfs, zips, or online documents to a data index in a project
Expand All @@ -157,12 +160,26 @@ def upload_files(
raise typer.Abort()

coords = ElasticProjectDataCollectionSource(proj_key=proj_key, index_key=index_key)

if conv_settings is not None:
try:
final_conv_settings = ConversionSettings.parse_obj(
json.loads(conv_settings)
)
except json.JSONDecodeError:
raise ValueError(
"Could not parse a ConversionSettings object from --conv-settings flag"
)
else:
final_conv_settings = None

utils.upload_files(
api=api,
coords=coords,
url=urls,
local_file=local_file,
s3_coordinates=cos_coordinates,
conv_settings=final_conv_settings,
)

typer.echo("Tasks have been queued successfully")
Expand Down
2 changes: 1 addition & 1 deletion deepsearch/cps/client/components/data_indices.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def delete(
def upload_file(
self,
coords: ElasticProjectDataCollectionSource,
body: Union[Dict[str, List[str]], Dict[str, Dict[str, Dict]]],
body: Dict[str, Any],
) -> str:
"""
Call api for converting and uploading file to a project's data index.
Expand Down
15 changes: 12 additions & 3 deletions deepsearch/cps/data_indices/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import os
from pathlib import Path
from typing import Any, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

import urllib3

Expand All @@ -14,6 +14,7 @@
from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource
from deepsearch.documents.core import convert, input_process
from deepsearch.documents.core.common_routines import progressbar
from deepsearch.documents.core.models import ConversionSettings
from deepsearch.documents.core.utils import cleanup, create_root_dir

logger = logging.getLogger(__name__)
Expand All @@ -25,6 +26,7 @@ def upload_files(
url: Optional[Union[str, List[str]]] = None,
local_file: Optional[Union[str, Path]] = None,
s3_coordinates: Optional[S3Coordinates] = None,
conv_settings: Optional[ConversionSettings] = None,
):
"""
Orchestrate document conversion and upload to an index in a project
Expand All @@ -47,6 +49,7 @@ def upload_files(
api=api,
coords=coords,
local_file=Path(local_file),
conv_settings=conv_settings,
)
elif url is None and local_file is None and s3_coordinates is not None:
return process_external_cos(
Expand Down Expand Up @@ -101,6 +104,7 @@ def process_local_file(
coords: ElasticProjectDataCollectionSource,
local_file: Path,
progress_bar: bool = False,
conv_settings: Optional[ConversionSettings] = None,
):
"""
Individual files are uploaded for conversion and storage in data index.
Expand Down Expand Up @@ -130,7 +134,7 @@ def process_local_file(
count_total_files = len(files_zip)

# container for task_ids
task_ids = []
task_ids: List[str] = []

# start loop
with tqdm(
Expand All @@ -147,7 +151,12 @@ def process_local_file(
api=api, cps_proj_key=coords.proj_key, source_path=Path(single_zip)
)
file_url_array = [private_download_url]
payload = {"file_url": file_url_array}
payload: Dict[str, Any] = {
"file_url": file_url_array,
}
if conv_settings is not None:
payload["conversion_settings"] = conv_settings.dict()

task_id = api.data_indices.upload_file(coords=coords, body=payload)
task_ids.append(task_id)
progress.update(1)
Expand Down