Merge pull request #150 from multinet-app/upload-downloads
Make the network JSON uploader more flexible and allow JSON table uploads
JackWilb authored Feb 28, 2023
2 parents b0e9383 + e0aef36 commit 8789de7
Showing 14 changed files with 744 additions and 13 deletions.
18 changes: 18 additions & 0 deletions multinet/api/migrations/0013_alter_upload_data_type.py
@@ -0,0 +1,18 @@
# Generated by Django 3.2.18 on 2023-02-28 22:04

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('api', '0012_aqlquery_bind_vars'),
    ]

    operations = [
        migrations.AlterField(
            model_name='upload',
            name='data_type',
            field=models.CharField(choices=[('CSV', 'Csv'), ('JSON', 'Json'), ('D3_JSON', 'D3 Json'), ('NESTED_JSON', 'Nested Json'), ('NEWICK', 'Newick')], max_length=20),
        ),
    ]
1 change: 1 addition & 0 deletions multinet/api/models/tasks.py
@@ -32,6 +32,7 @@ class Upload(Task):

     class DataType(models.TextChoices):
         CSV = 'CSV'
+        JSON = 'JSON'
         D3_JSON = 'D3_JSON'
         NESTED_JSON = 'NESTED_JSON'
         NEWICK = 'NEWICK'
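A minimal sketch (not part of the diff) of what the new choice enables; the other fields passed to Upload.objects.create here are assumptions based on their use elsewhere in this changeset:

from multinet.api.models import Upload

upload = Upload.objects.create(
    workspace=workspace,             # hypothetical existing Workspace
    blob=uploaded_blob,              # hypothetical reference to the uploaded file
    data_type=Upload.DataType.JSON,  # the choice added above
)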
3 changes: 2 additions & 1 deletion multinet/api/tasks/upload/__init__.py
@@ -1,5 +1,6 @@
 from .common import ProcessUploadTask
 from .csv import process_csv
 from .d3_json import process_d3_json
+from .json_table import process_json_table
 
-__all__ = ['ProcessUploadTask', 'process_csv', 'process_d3_json']
+__all__ = ['ProcessUploadTask', 'process_csv', 'process_d3_json', 'process_json_table']
43 changes: 34 additions & 9 deletions multinet/api/tasks/upload/d3_json.py
@@ -7,29 +7,44 @@
 from multinet.api.models import Network, Table, Upload
 
 from .common import ProcessUploadTask
+from .exceptions import DataFormatError
 
 logger = get_task_logger(__name__)
 
 
 def d3_node_to_arango_doc(node: Dict) -> Dict:
     new_node = dict(node)
 
-    # Return None if necessary
-    node_id = new_node.pop('id', None)
+    # Check for a field we can use as the key; '_key' is preferred, then 'id'
+    if '_key' in new_node:
+        node_id = new_node.get('_key')
+    elif 'id' in new_node:
+        node_id = new_node.pop('id')
+    else:
+        node_id = None
 
     if node_id is None:
         return None
 
     # Assign and return
     new_node['_key'] = str(node_id)
     return new_node
 
 
 def d3_link_to_arango_doc(link: Dict, node_table_name: str) -> Dict:
     new_link = dict(link)
 
-    # Return None if necessary
-    source = new_link.pop('source', None)
-    target = new_link.pop('target', None)
+    # Check for fields we can use as the endpoints; '_from' and '_to' are
+    # preferred, then 'source' and 'target'
+    if '_from' in new_link and '_to' in new_link:
+        source = new_link.get('_from').split('/')[-1]
+        target = new_link.get('_to').split('/')[-1]
+    elif 'source' in new_link and 'target' in new_link:
+        source = new_link.pop('source', None)
+        target = new_link.pop('target', None)
+    else:
+        source = None
+        target = None
 
     if source is None or target is None:
         return None

@@ -60,9 +75,19 @@ def process_d3_json(
         for node in (d3_node_to_arango_doc(node) for node in d3_dict['nodes'])
         if node is not None
     ]
-    d3_dict['links'] = [
+
+    if 'links' in d3_dict:
+        link_property_name = 'links'
+    elif 'edges' in d3_dict:
+        link_property_name = 'edges'
+    else:
+        raise DataFormatError("JSON network file missing 'links' or 'edges' property")
+
+    d3_dict[link_property_name] = [
         link
-        for link in (d3_link_to_arango_doc(link, node_table_name) for link in d3_dict['links'])
+        for link in (
+            d3_link_to_arango_doc(link, node_table_name) for link in d3_dict[link_property_name]
+        )
         if link is not None
     ]

@@ -80,7 +105,7 @@ def process_d3_json(

     # Insert rows
     node_table.put_rows(d3_dict['nodes'])
-    edge_table.put_rows(d3_dict['links'])
+    edge_table.put_rows(d3_dict[link_property_name])
 
     # Create network
     Network.create_with_edge_definition(
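A minimal usage sketch (not part of the diff) of the two normalizers above; the node results follow directly from the code shown, and only the early-exit link case is shown since the tail of d3_link_to_arango_doc is collapsed here:

from multinet.api.tasks.upload.d3_json import d3_link_to_arango_doc, d3_node_to_arango_doc

d3_node_to_arango_doc({'_key': 0, 'name': 'Myriel'})  # -> {'_key': '0', 'name': 'Myriel'}
d3_node_to_arango_doc({'id': 0, 'name': 'Myriel'})    # -> {'name': 'Myriel', '_key': '0'}
d3_node_to_arango_doc({'name': 'Myriel'})             # -> None: no '_key' or 'id', so the node is dropped
d3_link_to_arango_doc({'weight': 2}, 'characters')    # -> None: no endpoint fields present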
4 changes: 4 additions & 0 deletions multinet/api/tasks/upload/exceptions.py
@@ -0,0 +1,4 @@
class DataFormatError(Exception):
    def __init__(self, message):
        # Call the base class constructor with the parameters it needs
        super().__init__(message)
73 changes: 73 additions & 0 deletions multinet/api/tasks/upload/json_table.py
@@ -0,0 +1,73 @@
import json
from typing import Any, BinaryIO, Dict

from celery import shared_task

from multinet.api.models import Table, TableTypeAnnotation, Upload

from .common import ProcessUploadTask
from .utils import processor_dict


def process_row(row: Dict[str, Any], cols: Dict[str, TableTypeAnnotation.Type]) -> Dict:
    new_row = dict(row)

    # Check for '_key' or 'id'; if neither is present, skip the row
    if not (new_row.get('_key') or new_row.get('id')):
        return None

    for col_key, col_type in cols.items():
        entry = row.get(col_key)

        # If the entry is null, skip it
        if entry is None:
            continue

        process_func = processor_dict.get(col_type)
        if process_func is not None:
            try:
                new_row[col_key] = process_func(entry)
            except ValueError:
                # If the entry can't be coerced, keep it as-is
                pass

    return new_row


@shared_task(base=ProcessUploadTask)
def process_json_table(
    task_id: int,
    table_name: str,
    edge: bool,
    columns: Dict[str, TableTypeAnnotation.Type],
) -> None:
    upload: Upload = Upload.objects.get(id=task_id)

    # Create new table
    table: Table = Table.objects.create(
        name=table_name,
        edge=edge,
        workspace=upload.workspace,
    )

    # Create type annotations
    TableTypeAnnotation.objects.bulk_create(
        [
            TableTypeAnnotation(table=table, column=col_key, type=col_type)
            for col_key, col_type in columns.items()
        ]
    )

    # Download data from S3/MinIO
    with upload.blob as blob_file:
        blob_file: BinaryIO = blob_file
        imported_json = json.loads(blob_file.read().decode('utf-8'))

    processed_rows = [
        new_row
        for new_row in [process_row(row, columns) for row in imported_json]
        if new_row is not None
    ]

    # Put rows in the table
    table.put_rows(processed_rows)
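A minimal usage sketch (not part of the diff) of the row filter above; passing an empty column map is an assumption made here so the examples don't depend on specific TableTypeAnnotation.Type members:

from multinet.api.tasks.upload.json_table import process_row

process_row({'_key': '0', 'name': 'Myriel'}, cols={})  # -> {'_key': '0', 'name': 'Myriel'}
process_row({'id': 12, 'group': '1'}, cols={})         # -> {'id': 12, 'group': '1'}
process_row({'name': 'Myriel'}, cols={})               # -> None: rows without '_key' or 'id' are skipped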
27 changes: 27 additions & 0 deletions multinet/api/tests/data/characters.json
@@ -0,0 +1,27 @@
[
  {
    "_key": "0",
    "name": "Myriel",
    "group": "1"
  },
  {
    "_key": "1",
    "name": "Napoleon",
    "group": "1"
  },
  {
    "_key": "2",
    "name": "Mlle.Baptistine",
    "group": "1"
  },
  {
    "_key": "3",
    "name": "Mme.Magloire",
    "group": "1"
  },
  {
    "_key": "4",
    "name": "CountessdeLo",
    "group": "1"
  }
]