Skip to content

Commit

Permalink
bugfix/drop use of ndjson dep, use local code (#3886)
Browse files Browse the repository at this point in the history
### Description
Avoid using the ndjson dependency due to the limiting license that
exists on it
  • Loading branch information
rbiseck3 authored Jan 24, 2025
1 parent 8f2a719 commit e230364
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 11 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.16-dev0
## 0.16.16-dev1

### Enhancements

Expand All @@ -7,6 +7,8 @@

### Fixes

* **Drop usage of ndjson dependency**

## 0.16.15

### Enhancements
Expand Down
1 change: 0 additions & 1 deletion requirements/base.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,3 @@ tqdm
psutil
python-oxmsg
html5lib
ndjson
2 changes: 0 additions & 2 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ mypy-extensions==1.0.0
# via
# typing-inspect
# unstructured-client
ndjson==0.3.1
# via -r ./base.in
nest-asyncio==1.6.0
# via unstructured-client
nltk==3.9.1
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.16-dev0" # pragma: no cover
__version__ = "0.16.16-dev1" # pragma: no cover
67 changes: 67 additions & 0 deletions unstructured/file_utils/ndjson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
Adds support for working with newline-delimited JSON (ndjson) files. Because every line is a
complete JSON document, this format allows JSON content to be streamed record by record, which
is not possible with a single raw JSON file.
"""

import json
from typing import IO, Any


def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
    """
    Converts the list of dictionaries into its ndjson string representation.

    Every dictionary is serialized onto exactly one line and the lines are joined with a
    newline character; no trailing newline is appended. An empty list yields an empty string.

    Args:
        obj (list[dict[str, Any]]): List of dictionaries to convert
        **kwargs: Additional keyword arguments to pass to json.dumps. ``indent`` is always
            overridden with ``None`` because an indented record spans several lines.
    Returns:
        str: string representation of the list of dictionaries
    """
    # Indent breaks ndjson formatting: each record must stay on a single line.
    kwargs["indent"] = None
    return "\n".join(json.dumps(each, **kwargs) for each in obj)


def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
    """
    Serializes the list of dictionaries as ndjson and writes the result to a file.

    Args:
        obj (list[dict[str, Any]]): List of dictionaries to write
        fp (IO): File pointer that receives the serialized text
        **kwargs: Additional keyword arguments to pass to json.dumps; any ``indent`` value
            supplied by the caller is replaced with ``None``
    Returns:
        None
    """
    # Indent breaks ndjson formatting, so it is overridden no matter what the caller passed.
    serialized = dumps(obj, **{**kwargs, "indent": None})
    fp.write(serialized)


def loads(s: str, **kwargs) -> list[dict[str, Any]]:
    """
    Converts the raw ndjson string into a list of dictionaries.

    Records are separated on carriage-return / line-feed characters only. Blank or
    whitespace-only lines (e.g. a trailing newline or an empty line between records) are
    skipped rather than treated as invalid records.

    Args:
        s (str): Raw string to convert
        **kwargs: Additional keyword arguments to pass to json.loads
    Returns:
        list[dict[str, Any]]: List of dictionaries parsed from the input string
    Raises:
        json.JSONDecodeError: If a non-blank line is not a valid JSON document
    """
    # str.splitlines() is avoided on purpose: it also breaks on characters such as U+2028,
    # U+2029 and U+0085, which may legally appear unescaped inside a JSON string and would
    # cut a record in half. Turning "\r" into "\n" covers "\r\n" and bare "\r" line endings;
    # the empty segments this produces are dropped by the blank-line filter below.
    lines = s.replace("\r", "\n").split("\n")
    return [json.loads(line, **kwargs) for line in lines if line.strip()]


def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
    """
    Reads the full contents of an ndjson file and parses them into a list of dictionaries.

    Args:
        fp (IO): File pointer to read the ndjson text from
        **kwargs: Additional keyword arguments to pass to json.loads
    Returns:
        list[dict[str, Any]]: List of dictionaries parsed from the file
    """
    # The whole file is read in one call and parsed from the resulting string.
    raw_text = fp.read()
    return loads(raw_text, **kwargs)
5 changes: 2 additions & 3 deletions unstructured/partition/ndjson.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@
import json
from typing import IO, Any, Optional

import ndjson

from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import (
FileType,
add_metadata_with_filetype,
is_ndjson_processable,
)
from unstructured.file_utils.ndjson import loads as ndjson_loads
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.staging.base import elements_from_dicts
Expand Down Expand Up @@ -74,7 +73,7 @@ def partition_ndjson(
)

try:
element_dicts = ndjson.loads(file_text)
element_dicts = ndjson_loads(file_text)
elements = elements_from_dicts(element_dicts)
except json.JSONDecodeError:
raise ValueError("Not a valid ndjson")
Expand Down
5 changes: 2 additions & 3 deletions unstructured/staging/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@
from datetime import datetime
from typing import Any, Iterable, Optional, Sequence, cast

import ndjson

from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import (
TYPE_TO_TEXT_ELEMENT_MAP,
CheckBox,
Element,
ElementMetadata,
)
from unstructured.file_utils.ndjson import dumps as ndjson_dumps
from unstructured.partition.common.common import exactly_one
from unstructured.utils import Point, dependency_exists, requires_dependencies

Expand Down Expand Up @@ -168,7 +167,7 @@ def elements_to_ndjson(
# -- serialize `elements` as newline-delimited JSON (str), one element per line --
precision_adjusted_elements = _fix_metadata_field_precision(elements)
element_dicts = elements_to_dicts(precision_adjusted_elements)
ndjson_str = ndjson.dumps(element_dicts, sort_keys=True)
ndjson_str = ndjson_dumps(element_dicts, sort_keys=True)

if filename is not None:
with open(filename, "w", encoding=encoding) as f:
Expand Down

0 comments on commit e230364

Please sign in to comment.