diff --git a/.mlem.yaml b/.mlem.yaml
deleted file mode 100644
index 410b7ba..0000000
--- a/.mlem.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-core:
-  storage:
-    type: dvc
diff --git a/Dockerfile b/Dockerfile
index da11120..2c83174 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,5 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.8
+ARG BASE_IMAGE=tiangolo/uvicorn-gunicorn-fastapi:python3.10
+FROM ${BASE_IMAGE}
 # Gunicorn image 3.4G: https://github.com/tiangolo/uvicorn-gunicorn-docker/tree/master/docker-images
 
@@ -8,53 +9,43 @@ LABEL org.opencontainers.image.source="https://github.com/MaastrichtU-IDS/transl
 USER root
 WORKDIR /app
 
-# Java 11 required for Spark to work
-RUN echo 'deb http://ftp.fr.debian.org/debian bullseye main' >> /etc/apt/sources.list.d/bullseye.list && \
-    apt-get update && \
-    apt-get install -y build-essential wget curl vim openjdk-11-jdk && \
+RUN apt-get update && \
+    apt-get install -y build-essential wget curl vim && \
     pip install --upgrade pip
+# RUN curl -sSf https://rye-up.com/get | RYE_INSTALL_OPTION="--yes" bash
 
-# TODO: remove? Install Spark for standalone context in /opt
-ENV APACHE_SPARK_VERSION=3.2.0
-ENV HADOOP_VERSION=3.2
-ENV SPARK_HOME=/opt/spark
-ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx2048M --driver-java-options=-Dlog4j.logLevel=info"
-ENV PATH="${PATH}:${SPARK_HOME}/bin"
-RUN wget -q -O spark.tgz https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
-    tar xzf spark.tgz -C /opt && \
-    rm "spark.tgz" && \
-    ln -s "/opt/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" $SPARK_HOME
-RUN echo "log4j.rootCategory=ERROR, console" > $SPARK_HOME/conf/log4j.properties
-# RUN chown -R 1000:1000 /opt/spark
-
-# Define some environment variables for pyspark and gunicorn config
-ENV PYSPARK_PYTHON=/usr/local/bin/python3
-ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3
-
-
-
-ENV PORT=8808
-ENV GUNICORN_CMD_ARGS="--preload"
+ENV PORT=8808 \
+    GUNICORN_CMD_ARGS="--preload" \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    ACCESS_LOG="-" \
+    ERROR_LOG="-" \
+    OPENTELEMETRY_ENABLED=false
 
 # Use requirements.txt to install some dependencies only when needed
-COPY requirements.txt .
-RUN pip install -r requirements.txt
+# COPY requirements.txt .
+# RUN pip install -r requirements.txt
 
 ## Copy the source code (in the same folder as the Dockerfile)
 COPY . .
 
-ENV MODULE_NAME=trapi.main
-ENV VARIABLE_NAME=app
+ENV MODULE_NAME=trapi.main \
+    VARIABLE_NAME=app
+
+# WORKDIR /app/trapi-openpredict
+
+# RUN pip install -e /app/predict-drug-target /app/trapi-predict-kit
+RUN pip install -e .
-RUN pip install -e ".[train,test]"
-# RUN pip install -e ./trapi-predict-kit
+# RUN pip install -e . /app/predict-drug-target /app/trapi-predict-kit
+# RUN pip install -e /app/trapi-predict-kit
 
 RUN dvc pull -f
 
 EXPOSE 8808
 
-# ENTRYPOINT [ "gunicorn", "-w", "8", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8808", "trapi.main:app"]
+# ENTRYPOINT [ "gunicorn", "-w", "8", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8808", "src.trapi_openpredict.main:app"]
 
 # Build entrypoint script to pull latest dvc changes before startup
 RUN echo "#!/bin/bash" > /entrypoint.sh && \
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index f578a69..00172c3 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -21,14 +21,15 @@ services:
 
       # OPENPREDICT_APIKEY: ${OPENPREDICT_APIKEY}
 
-  jaeger-otel-agent.sri:
-    image: jaegertracing/all-in-one:latest
-    # ports:
-    #   - "16686:16686"
-    #   - "4318:4318"
-    #   - "6831:6831/udp"
-    environment:
-      LOG_LEVEL: debug
-      VIRTUAL_HOST: jaeger.137.120.31.102.nip.io
-      LETSENCRYPT_HOST: jaeger.137.120.31.102.nip.io
-      VIRTUAL_PORT: 16686
\ No newline at end of file
+  # NOTE: not required, just for testing
+  # jaeger-otel-agent.sri:
+  #   image: jaegertracing/all-in-one:latest
+  #   # ports:
+  #   #   - "16686:16686"
+  #   #   - "4318:4318"
+  #   #   - "6831:6831/udp"
+  #   environment:
+  #     LOG_LEVEL: debug
+  #     VIRTUAL_HOST: jaeger.137.120.31.102.nip.io
+  #     LETSENCRYPT_HOST: jaeger.137.120.31.102.nip.io
+  #     VIRTUAL_PORT: 16686
diff --git a/docker-compose.test.yml b/docker-compose.test.yml
new file mode 100644
index 0000000..1b5df06
--- /dev/null
+++ b/docker-compose.test.yml
@@ -0,0 +1,49 @@
+version: "3"
+services:
+
+  # Container used to run training for the original OpenPredict drug-disease model
+  train:
+    build:
+      context: .
+      dockerfile: src/openpredict_model/Dockerfile
+    ports:
+      - 8808:8808
+    volumes:
+      - ./:/app
+    environment:
+      # Show print() in logs:
+      PYTHONUNBUFFERED: '1'
+      LOG_LEVEL: 'INFO'
+      NO_JAEGER: "true"
+    entrypoint: sleep infinity
+
+
+  # Container used for testing and running scripts
+  tests:
+    build: .
+    volumes:
+      - ./:/app
+      - ~/.nanopub-docker:/root/.nanopub
+    environment:
+      PYTHONUNBUFFERED: '1'
+      LOG_LEVEL: 'INFO'
+      NO_JAEGER: "true"
+    entrypoint: pytest --cov=src tests/integration
+    # entrypoint: pytest tests/integration/test_train_model.py -s
+    # entrypoint: pytest tests/integration/test_openpredict_api.py::test_post_trapi -s
+    # entrypoint: pytest tests/package/test_decorator.py -s
+
+
+  # Container to deploy a JupyterLab/VSCode workspace for development
+  # workspace:
+  #   image: ghcr.io/maastrichtu-ids/jupyterlab
+  #   ports:
+  #     - 8888:8888
+  #   volumes:
+  #     - ./:/home/jovyan/work
+  #   user: root
+  #   environment:
+  #     - GRANT_SUDO=yes
+  #     - LOG_LEVEL=INFO
+  #     ## With password:
+  #     # - JUPYTER_TOKEN=password
diff --git a/docker-compose.yml b/docker-compose.yml
index 5e11209..bc6a770 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,35 +13,4 @@ services:
       PYTHONUNBUFFERED: '1'
       LOG_LEVEL: 'INFO'
       NO_JAEGER: "true"
-    entrypoint: uvicorn trapi.main:app --host 0.0.0.0 --port 8808 --debug --reload
-
-
-  # Container used for testing and running scripts
-  tests:
-    build: .
-    volumes:
-      - ./:/app
-      - ~/.nanopub-docker:/root/.nanopub
-    environment:
-      PYTHONUNBUFFERED: '1'
-      LOG_LEVEL: 'INFO'
-      NO_JAEGER: "true"
-    entrypoint: pytest --cov=src tests/integration
-    # entrypoint: pytest tests/integration/test_train_model.py -s
-    # entrypoint: pytest tests/integration/test_openpredict_api.py::test_post_trapi -s
-    # entrypoint: pytest tests/package/test_decorator.py -s
-
-
-  # Container to deploy a JupyterLab/VSCode workspace for development
-  # workspace:
-  #   image: ghcr.io/maastrichtu-ids/jupyterlab
-  #   ports:
-  #     - 8888:8888
-  #   volumes:
-  #     - ./:/home/jovyan/work
-  #   user: root
-  #   environment:
-  #     - GRANT_SUDO=yes
-  #     - LOG_LEVEL=INFO
-  #     ## With password:
-  #     # - JUPYTER_TOKEN=password
+    entrypoint: uvicorn trapi.main:app --host 0.0.0.0 --port 8808 --reload
diff --git a/pyproject.toml b/pyproject.toml
index 103186b..bb76176 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,32 +30,28 @@ classifiers = [
 dynamic = ["version"]
 
 dependencies = [
-    "requests >=2.23.0",
-    # "trapi-predict-kit[opentelemetry] >=0.2.3",
-    "trapi-predict-kit @ git+https://github.com/MaastrichtU-IDS/trapi-predict-kit.git",
-    # "trapi-predict-kit @ {root:uri}/trapi-predict-kit",
+    "numpy",
+    "pandas",
+    "scikit-learn",
+    "gensim", # For word2vec
+    "scipy==1.10.1", # https://stackoverflow.com/questions/78279136/importerror-cannot-import-name-triu-from-scipy-linalg-gensim
 
-    "pydantic >=1.9",
-    "fastapi >=0.68.1",
+    "requests >=2.23.0",
     "rdflib >=6.1.1",
     "SPARQLWrapper >=2.0.0,<3.0.0",
-    # "mlem",
-    "mlem >=0.4.0",
     "reasoner-pydantic >=4.1.4",
 
+    # "trapi-predict-kit[web,opentelemetry] >=0.2.3",
+    "trapi-predict-kit[web,opentelemetry] @ git+https://github.com/MaastrichtU-IDS/trapi-predict-kit.git",
+    # "trapi-predict-kit @ {root:uri}/trapi-predict-kit",
+    "predict-drug-target @ git+https://github.com/MaastrichtU-IDS/predict-drug-target.git",
+    # "predict-drug-target", # Load from workspace
 
-    "opentelemetry-sdk",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-exporter-jaeger",
-    "opentelemetry-instrumentation-fastapi",
-    "opentelemetry-instrumentation-httpx",
-    "opentelemetry-instrumentation-requests",
-    # "fairworkflows",
-    # "fairworkflows@git+https://github.com/vemonet/fairworkflows.git",
-    # "yatiml >=0.10.0",
+    # "pydantic >=1.9",
 ]
 
 [project.optional-dependencies]
train = [
+    # Dependencies to train the original OpenPredict drug-disease model
     "numpy ==1.16.1", # requires python 3.8
     "pandas ==1.1.1",
     "torch ==1.13.1",
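The `scipy==1.10.1` pin above exists because gensim imports `triu` from `scipy.linalg`, which SciPy removed in 1.13. A minimal check that the pinned environment still loads gensim — a sketch, assuming it runs in an environment built from the dependencies above:

```python
# Sanity-check sketch (assumption: run in an environment installed from the
# dependencies above). SciPy >= 1.13 removed scipy.linalg.triu, which gensim
# imports at load time, hence the scipy==1.10.1 pin.
import scipy
from gensim.models import KeyedVectors  # raises ImportError without the pin

print(scipy.__version__)  # expected: 1.10.1
```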
diff --git a/src/openpredict_model/Dockerfile b/src/openpredict_model/Dockerfile
new file mode 100644
index 0000000..1cb2e9b
--- /dev/null
+++ b/src/openpredict_model/Dockerfile
@@ -0,0 +1,67 @@
+FROM tiangolo/uvicorn-gunicorn-fastapi:python3.8
+# Gunicorn image 3.4G: https://github.com/tiangolo/uvicorn-gunicorn-docker/tree/master/docker-images
+
+# NOTE: Dockerfile to train the original OpenPredict drug-disease model
+
+LABEL org.opencontainers.image.source="https://github.com/MaastrichtU-IDS/translator-openpredict"
+
+# Change the current user to root and the working directory to /app
+USER root
+WORKDIR /app
+
+# Java 11 required for Spark to work
+RUN echo 'deb http://ftp.fr.debian.org/debian bullseye main' >> /etc/apt/sources.list.d/bullseye.list && \
+    apt-get update && \
+    apt-get install -y build-essential wget curl vim openjdk-11-jdk && \
+    pip install --upgrade pip
+
+
+# TODO: remove? Install Spark for standalone context in /opt
+ENV APACHE_SPARK_VERSION=3.2.0
+ENV HADOOP_VERSION=3.2
+ENV SPARK_HOME=/opt/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx2048M --driver-java-options=-Dlog4j.logLevel=info"
+ENV PATH="${PATH}:${SPARK_HOME}/bin"
+RUN wget -q -O spark.tgz https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
+    tar xzf spark.tgz -C /opt && \
+    rm "spark.tgz" && \
+    ln -s "/opt/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" $SPARK_HOME
+RUN echo "log4j.rootCategory=ERROR, console" > $SPARK_HOME/conf/log4j.properties
+# RUN chown -R 1000:1000 /opt/spark
+
+# Define some environment variables for pyspark and gunicorn config
+ENV PYSPARK_PYTHON=/usr/local/bin/python3
+ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3
+
+
+
+ENV PORT=8808
+ENV GUNICORN_CMD_ARGS="--preload"
+
+# Use requirements.txt to install some dependencies only when needed
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+## Copy the source code (in the same folder as the Dockerfile)
+COPY . .
+
+ENV MODULE_NAME=trapi.main
+ENV VARIABLE_NAME=app
+
+RUN pip install -e ".[train,test]"
+# RUN pip install -e ./trapi-predict-kit
+
+RUN dvc pull -f
+
+EXPOSE 8808
+
+# ENTRYPOINT [ "gunicorn", "-w", "8", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8808", "trapi.main:app"]
+
+# Build entrypoint script to pull latest dvc changes before startup
+RUN echo "#!/bin/bash" > /entrypoint.sh && \
+    echo "dvc pull" >> /entrypoint.sh && \
+    echo "/start.sh" >> /entrypoint.sh && \
+    chmod +x /entrypoint.sh
+
+
+CMD [ "/entrypoint.sh" ]
diff --git a/src/openpredict_model/api.py b/src/openpredict_model/api.py
index 75e58c6..2052fdc 100644
--- a/src/openpredict_model/api.py
+++ b/src/openpredict_model/api.py
@@ -1,15 +1,10 @@
-import os
-import sys
 from datetime import datetime
 from enum import Enum
-from typing import Optional
 
-from fastapi import APIRouter, File, Query, UploadFile
+from fastapi import APIRouter, Query
 from rdflib import Graph
 
 from openpredict_model.evidence_path.predict import do_evidence_path
-from openpredict_model.explain_shap.explain_shap import get_explanations
-from openpredict_model.train import add_embedding
 from openpredict_model.utils import retrieve_features, retrieve_models
 
@@ -35,7 +30,8 @@ class EmbeddingTypes(str, Enum):
 models_g = Graph()
 models_g.parse("models/openpredict_baseline.ttl")
 
-
+# NOTE: commented due to issues with dependencies related to gensim:
+# ImportError: cannot import name 'triu' from 'scipy.linalg' (/usr/local/lib/python3.10/site-packages/scipy/linalg/__init__.py)
 @api.get("/evidence-path", name="Get the evidence path between two entities",
     description="""Get the evidence path between two entities. The evidence path is generated using the overall similarity score by default. You could change the included features by defining the names of the features.
 
@@ -101,64 +97,6 @@ def get_evidence_path(
 
 
 
-@api.get("/explain-shap", name="Get calculated shap explanations for predicted drug for a given disease",
-    description="""Return the explanations for predicted entities for a given disease with SHAP values for feature importances: drug (DrugBank ID) or disease (OMIM ID), with confidence scores.
-a disease_id can be provided,
-This operation is annotated with x-bte-kgs-operations, and follow the BioThings API recommendations.
-
-You can try:
-
-| disease_id: `OMIM:246300` |
-
-| to check the drug prediction explanations for a disease |
-""",
-    response_model=dict,
-    tags=["openpredict"],
-)
-def get_explanation(
-    #drug_id: Optional[str] = None,
-    disease_id: Optional[str] = 'OMIM:246300',
-    #model_id: str ='openpredict_baseline',
-    n_results: int = 100
-    ) -> dict:
-    """Get explanations for a given entity CURIE disease and predicted drugs.
-
-    :param entity: Get explanations associations for this entity CURIE
-    :return: Prediction results with shap values for all features in the ML model with score
-    """
-    time_start = datetime.now()
-    #return ('test: provide a drugid or diseaseid', 400)
-    # TODO: if drug_id and disease_id defined, then check if the disease appear in the provided drug predictions
-    concept_id = ''
-    drug_id= None
-    model_id=None
-    min_score=None
-    max_score=None
-    if drug_id:
-        concept_id = drug_id
-    elif disease_id:
-        concept_id = disease_id
-    else:
-        return ('Bad request: provide a drugid or diseaseid', 400)
-
-    try:
-
-        prediction_json = get_explanations(
-            concept_id, model_id, min_score, max_score, n_results
-        )
-
-    except Exception as e:
-        print('Error processing ID ' + concept_id)
-        print(e)
-        print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(e).__name__, e)
-
-        return ('Not found: entry in OpenPredict for ID ' + concept_id, 404)
-
-    print('PredictRuntime: ' + str(datetime.now() - time_start))
-    return {'hits': prediction_json, 'count': len(prediction_json)}
-
-
-
 @api.get("/features", name="Return the features trained in the models",
     description="""Return the features trained in the model, for Drugs, Diseases or Both.""",
     response_model=dict,
@@ -188,41 +126,100 @@ def get_models() -> dict:
     return retrieve_models(models_g)
 
-
-@api.post("/embedding", name="Upload your embedding for drugs or diseases",
-    description="""Upload your embedding file:
-
-1. Select which types do you have in the embeddings: Drugs, Diseases or Both.
-
-2. Define the base `model_id`: use the `/models` call to see the list of trained models with their characteristics, and pick the ID of the model you will use as base to add your embedding
-
-3. The model will be retrained and evaluation will be stored in a triplestore (available in `/models`)
-""",
-    response_model=dict,
-    tags=["openpredict"],
-)
-def post_embedding(
-    emb_name: str, description: str,
-    types: EmbeddingTypes ='Both', model_id: str ='openpredict_baseline',
-    apikey: str=None,
-    uploaded_file: UploadFile = File(...)
-    ) -> dict:
-    """Post JSON embeddings via the API, with simple APIKEY authentication
-    provided in environment variables
-    """
-    if type(types) is EmbeddingTypes:
-        types = types.value
-
-    # Ignore the API key check if no env variable defined (for development)
-    if os.getenv('OPENPREDICT_APIKEY') == apikey or os.getenv('OPENPREDICT_APIKEY') is None:
-        embedding_file = uploaded_file.file
-        run_id, loaded_model = add_embedding(
-            embedding_file, emb_name, types, model_id)
-        print('Embeddings uploaded')
-        # train_model(False)
-        return {
-            'status': 200,
-            'message': 'Embeddings added for run ' + run_id + ', trained model has scores ' + str(loaded_model.scores)
-        }
-    else:
-        return {'Forbidden': 403}
+# from openpredict_model.explain_shap.explain_shap import get_explanations
+# from openpredict_model.train import add_embedding
+
+# @api.get("/explain-shap", name="Get calculated shap explanations for predicted drug for a given disease",
+#     description="""Return the explanations for predicted entities for a given disease with SHAP values for feature importances: drug (DrugBank ID) or disease (OMIM ID), with confidence scores.
+# a disease_id can be provided,
+# This operation is annotated with x-bte-kgs-operations, and follow the BioThings API recommendations.
+
+# You can try:
+
+# | disease_id: `OMIM:246300` |
+
+# | to check the drug prediction explanations for a disease |
+# """,
+#     response_model=dict,
+#     tags=["openpredict"],
+# )
+# def get_explanation(
+#     #drug_id: Optional[str] = None,
+#     disease_id: Optional[str] = 'OMIM:246300',
+#     #model_id: str ='openpredict_baseline',
+#     n_results: int = 100
+#     ) -> dict:
+#     """Get explanations for a given entity CURIE disease and predicted drugs.
+
+#     :param entity: Get explanations associations for this entity CURIE
+#     :return: Prediction results with shap values for all features in the ML model with score
+#     """
+#     time_start = datetime.now()
+#     #return ('test: provide a drugid or diseaseid', 400)
+#     # TODO: if drug_id and disease_id defined, then check if the disease appear in the provided drug predictions
+#     concept_id = ''
+#     drug_id= None
+#     model_id=None
+#     min_score=None
+#     max_score=None
+#     if drug_id:
+#         concept_id = drug_id
+#     elif disease_id:
+#         concept_id = disease_id
+#     else:
+#         return ('Bad request: provide a drugid or diseaseid', 400)
+
+#     try:
+
+#         prediction_json = get_explanations(
+#             concept_id, model_id, min_score, max_score, n_results
+#         )
+
+#     except Exception as e:
+#         print('Error processing ID ' + concept_id)
+#         print(e)
+#         print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(e).__name__, e)
+
+#         return ('Not found: entry in OpenPredict for ID ' + concept_id, 404)
+
+#     print('PredictRuntime: ' + str(datetime.now() - time_start))
+#     return {'hits': prediction_json, 'count': len(prediction_json)}
+
+
+# @api.post("/embedding", name="Upload your embedding for drugs or diseases",
+#     description="""Upload your embedding file:
+
+# 1. Select which types do you have in the embeddings: Drugs, Diseases or Both.
+
+# 2. Define the base `model_id`: use the `/models` call to see the list of trained models with their characteristics, and pick the ID of the model you will use as base to add your embedding
+
+# 3. The model will be retrained and evaluation will be stored in a triplestore (available in `/models`)
+# """,
+#     response_model=dict,
+#     tags=["openpredict"],
+# )
+# def post_embedding(
+#     emb_name: str, description: str,
+#     types: EmbeddingTypes ='Both', model_id: str ='openpredict_baseline',
+#     apikey: str=None,
+#     uploaded_file: UploadFile = File(...)
+#     ) -> dict:
+#     """Post JSON embeddings via the API, with simple APIKEY authentication
+#     provided in environment variables
+#     """
+#     if type(types) is EmbeddingTypes:
+#         types = types.value
+
+#     # Ignore the API key check if no env variable defined (for development)
+#     if os.getenv('OPENPREDICT_APIKEY') == apikey or os.getenv('OPENPREDICT_APIKEY') is None:
+#         embedding_file = uploaded_file.file
+#         run_id, loaded_model = add_embedding(
+#             embedding_file, emb_name, types, model_id)
+#         print('Embeddings uploaded')
+#         # train_model(False)
+#         return {
+#             'status': 200,
+#             'message': 'Embeddings added for run ' + run_id + ', trained model has scores ' + str(loaded_model.scores)
+#         }
+#     else:
+#         return {'Forbidden': 403}
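With `/explain-shap` and `/embedding` commented out, the router keeps `/evidence-path`, `/features` and `/models`. A quick smoke test for the parameter-less `/models` route — a sketch, assuming the API is served locally on port 8808 with the router mounted at the root path:

```python
# Smoke-test sketch (assumptions: API running locally on port 8808,
# openpredict router mounted at the root path).
import requests

resp = requests.get("http://localhost:8808/models", timeout=30)
resp.raise_for_status()
print(resp.json())  # model metadata parsed from models/openpredict_baseline.ttl
```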
diff --git a/src/openpredict_model/predict.py b/src/openpredict_model/predict.py
index d4c1881..51619b1 100644
--- a/src/openpredict_model/predict.py
+++ b/src/openpredict_model/predict.py
@@ -6,8 +6,7 @@ import pandas as pd
 
 from trapi_predict_kit import load, PredictInput, PredictOutput, trapi_predict, get_entities_labels, get_entity_types, log
 
-from openpredict_model.train import createFeaturesSparkOrDF
-from openpredict_model.utils import load_features_embeddings, load_similarity_embeddings, get_openpredict_dir, resolve_ids_with_nodenormalization_api, resolve_id
+from openpredict_model.utils import load_features_embeddings, load_similarity_embeddings, get_openpredict_dir, resolve_ids_with_nodenormalization_api, createFeaturesSparkOrDF
 
 trapi_nodes = {
     "biolink:Disease": {
diff --git a/src/openpredict_model/train.py b/src/openpredict_model/train.py
index 6ea3eac..cac8b38 100644
--- a/src/openpredict_model/train.py
+++ b/src/openpredict_model/train.py
@@ -15,7 +15,7 @@
 
 from trapi_predict_kit import save, log
 
-from openpredict_model.utils import get_run_id, get_openpredict_dir
+from openpredict_model.utils import get_run_id, get_openpredict_dir, createFeatureDF, geometricMean, createFeaturesSparkOrDF
 
 cli = typer.Typer(help="Training for OpenPredict model")
 
@@ -340,26 +340,6 @@ def balance_data(pairs, classes, n_proportion):
     return pairs, classes
 
 
-# @is_fairstep(label='Compute the geometric means of a drug-disease association using previously generated dataframes', is_script_task=True)
-def geometricMean(drug, disease, knownDrugDisease, drugDF, diseaseDF):
-    """Compute the geometric means of a drug-disease association using previously generated dataframes
-
-    :param drug: Drug
-    :param disease: Disease
-    :param knownDrugDisease: Known drug-disease associations
-    :param drugDF: Drug dataframe
-    :param diseaseDF: Disease dataframe
-    """
-    a = drugDF.loc[knownDrugDisease[:, 0]][drug].values
-    b = diseaseDF.loc[knownDrugDisease[:, 1]][disease].values
-    c = np.sqrt(np.multiply(a, b))
-    ix2 = (knownDrugDisease == [drug, disease])
-    c[ix2[:, 1] & ix2[:, 0]] = 0.0
-    if len(c) == 0:
-        return 0.0
-    return float(max(c))
-
-
 # @is_fairstep(label='Create the features dataframes for Spark', is_script_task=True)
 def createFeatureArray(drug, disease, knownDrugDisease, drugDFs, diseaseDFs):
     """Create the features dataframes for Spark.
@@ -414,32 +394,6 @@ def sparkBuildFeatures(sc, pairs, classes, knownDrugDis, drug_df, disease_df):
     return df
 
 
-# @is_fairstep(label='Create the features dataframes', is_script_task=True)
-def createFeatureDF(pairs, classes, knownDrugDisease, drugDFs, diseaseDFs):
-    """Create the features dataframes.
-
-    :param pairs: Generated pairs
-    :param classes: Classes corresponding to the pairs
-    :param knownDrugDisease: Known drug-disease associations
-    :param drugDFs: Drug dataframes
-    :param diseaseDFs: Disease dataframes
-    :return: The features dataframe
-    """
-    len(drugDFs)*len(diseaseDFs)
-    # featureMatrix = np.empty((len(classes),totalNumFeatures), float)
-    df = pd.DataFrame(list(zip(pairs[:, 0], pairs[:, 1], classes)), columns=[
-        'Drug', 'Disease', 'Class'])
-    for _i, drug_col in enumerate(drugDFs.columns.levels[0]):
-        for _j, disease_col in enumerate(diseaseDFs.columns.levels[0]):
-            drugDF = drugDFs[drug_col]
-            diseaseDF = diseaseDFs[disease_col]
-            feature_series = df.apply(lambda row: geometricMean(
-                row.Drug, row.Disease, knownDrugDisease, drugDF, diseaseDF), axis=1)
-            # print (feature_series)
-            df["Feature_"+str(drug_col)+'_'+str(disease_col)] = feature_series
-    return df
-
-
 # @is_fairstep(label='Compute combined similarities', is_script_task=True)
 def calculateCombinedSimilarity(pairs_train, pairs_test, classes_train, classes_test, drug_df, disease_df, knownDrugDisease):
     """Compute combined similarities. Use Spark if available for speed, otherwise use pandas
@@ -553,30 +507,6 @@ def evaluate(test_df, clf):
     return scores
 
 
-# @is_fairstep(label='Create features dataframes', is_script_task=True)
-def createFeaturesSparkOrDF(pairs, classes, drug_df, disease_df):
-    """Create features dataframes. Use Spark if available for speed, otherwise use pandas
-    :param pairs: pairs
-    :param classes: classes
-    :param drug_df: drug
-    :param disease_df: disease dataframe
-    :return: Feature dataframe
-    """
-    spark_context = get_spark_context()
-    if spark_context:
-        log.info('Running Spark ✨')
-        drug_df_bc = spark_context.broadcast(drug_df)
-        disease_df_bc = spark_context.broadcast(disease_df)
-        knownDrugDis_bc = spark_context.broadcast(pairs[classes == 1])
-        feature_df = sparkBuildFeatures(
-            spark_context, pairs, classes, knownDrugDis_bc.value, drug_df_bc.value, disease_df_bc.value)
-        log.info("Finishing Spark jobs 🏁")
-        spark_context.stop()
-    else:
-        log.info("Spark cluster not found, using pandas 🐼")
-        feature_df = createFeatureDF(
-            pairs, classes, pairs[classes == 1], drug_df, disease_df)
-    return feature_df
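Note that `geometricMean`, `createFeatureDF` and `createFeaturesSparkOrDF` are moved rather than deleted: they now live in `openpredict_model.utils`, so any external caller needs the new import path:

```python
# New import path after the move (previously in openpredict_model.train):
from openpredict_model.utils import (
    createFeatureDF,
    createFeaturesSparkOrDF,
    geometricMean,
)
```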
diff --git a/src/openpredict_model/utils.py b/src/openpredict_model/utils.py
index dca1778..b85c6f0 100644
--- a/src/openpredict_model/utils.py
+++ b/src/openpredict_model/utils.py
@@ -4,6 +4,7 @@
 import uuid
 
 import pandas as pd
+import numpy as np
 import requests
 from gensim.models import KeyedVectors
 from rdflib import Graph
@@ -86,6 +87,74 @@ def load_similarity_embeddings(model_id: str = default_model_id):
     return emb_vectors
 
 
+def createFeaturesSparkOrDF(pairs, classes, drug_df, disease_df):
+    """Create features dataframes. Use Spark if available for speed, otherwise use pandas
+    :param pairs: pairs
+    :param classes: classes
+    :param drug_df: drug
+    :param disease_df: disease dataframe
+    :return: Feature dataframe
+    """
+    # spark_context = get_spark_context()
+    # if spark_context:
+    #     log.info('Running Spark ✨')
+    #     drug_df_bc = spark_context.broadcast(drug_df)
+    #     disease_df_bc = spark_context.broadcast(disease_df)
+    #     knownDrugDis_bc = spark_context.broadcast(pairs[classes == 1])
+    #     feature_df = sparkBuildFeatures(
+    #         spark_context, pairs, classes, knownDrugDis_bc.value, drug_df_bc.value, disease_df_bc.value)
+    #     log.info("Finishing Spark jobs 🏁")
+    #     spark_context.stop()
+    # else:
+    feature_df = createFeatureDF(
+        pairs, classes, pairs[classes == 1], drug_df, disease_df)
+    return feature_df
+
+
+def createFeatureDF(pairs, classes, knownDrugDisease, drugDFs, diseaseDFs):
+    """Create the features dataframes.
+
+    :param pairs: Generated pairs
+    :param classes: Classes corresponding to the pairs
+    :param knownDrugDisease: Known drug-disease associations
+    :param drugDFs: Drug dataframes
+    :param diseaseDFs: Disease dataframes
+    :return: The features dataframe
+    """
+    len(drugDFs)*len(diseaseDFs)
+    # featureMatrix = np.empty((len(classes),totalNumFeatures), float)
+    df = pd.DataFrame(list(zip(pairs[:, 0], pairs[:, 1], classes)), columns=[
+        'Drug', 'Disease', 'Class'])
+    for _i, drug_col in enumerate(drugDFs.columns.levels[0]):
+        for _j, disease_col in enumerate(diseaseDFs.columns.levels[0]):
+            drugDF = drugDFs[drug_col]
+            diseaseDF = diseaseDFs[disease_col]
+            feature_series = df.apply(lambda row: geometricMean(
+                row.Drug, row.Disease, knownDrugDisease, drugDF, diseaseDF), axis=1)
+            # print (feature_series)
+            df["Feature_"+str(drug_col)+'_'+str(disease_col)] = feature_series
+    return df
+
+
+def geometricMean(drug, disease, knownDrugDisease, drugDF, diseaseDF):
+    """Compute the geometric means of a drug-disease association using previously generated dataframes
+
+    :param drug: Drug
+    :param disease: Disease
+    :param knownDrugDisease: Known drug-disease associations
+    :param drugDF: Drug dataframe
+    :param diseaseDF: Disease dataframe
+    """
+    a = drugDF.loc[knownDrugDisease[:, 0]][drug].values
+    b = diseaseDF.loc[knownDrugDisease[:, 1]][disease].values
+    c = np.sqrt(np.multiply(a, b))
+    ix2 = (knownDrugDisease == [drug, disease])
+    c[ix2[:, 1] & ix2[:, 0]] = 0.0
+    if len(c) == 0:
+        return 0.0
+    return float(max(c))
+
+
+
+
 # TODO: not used
 MISSING_IDS = set()
 def convert_baseline_features_ids():
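For reference, a toy run of the relocated `geometricMean` — a sketch with made-up 2×2 similarity matrices (the real ones are produced by the training pipeline):

```python
import numpy as np
import pandas as pd

from openpredict_model.utils import geometricMean

# Made-up similarity matrices, indexed by entity IDs like the real ones.
drug_sim = pd.DataFrame(
    [[1.0, 0.64], [0.64, 1.0]], index=["DB01", "DB02"], columns=["DB01", "DB02"]
)
disease_sim = pd.DataFrame(
    [[1.0, 0.25], [0.25, 1.0]], index=["OMIM:1", "OMIM:2"], columns=["OMIM:1", "OMIM:2"]
)
known = np.array([["DB02", "OMIM:2"]])  # one known drug-disease association

# Feature value for the candidate pair (DB01, OMIM:1):
# sqrt(sim(DB02, DB01) * sim(OMIM:2, OMIM:1)) = sqrt(0.64 * 0.25) ≈ 0.4
print(geometricMean("DB01", "OMIM:1", known, drug_sim, disease_sim))
```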
diff --git a/src/trapi/main.py b/src/trapi/main.py
index b9cf3a2..3f8bffb 100644
--- a/src/trapi/main.py
+++ b/src/trapi/main.py
@@ -1,9 +1,10 @@
 import logging
 import os
 
+from predict_drug_target import get_drug_target_predictions
 from trapi_predict_kit import settings, TRAPI
 
-from drkg_model.api import api as drkg_model_api
+# from drkg_model.api import api as drkg_model_api
 from openpredict_model.api import api as openpredict_api
 from openpredict_model.predict import get_predictions, get_similarities
 from openpredict_model.utils import get_openpredict_dir
@@ -22,9 +23,8 @@ openapi_info = {
     "contact": {
-        "name": "Vincent Emonet",
-        "email": "vincent.emonet@maastrichtuniversity.nl",
-        # "x-id": "vemonet",
+        "name": "Michel Dumontier",
+        "email": "michel.dumontier@maastrichtuniversity.nl",
         "x-role": "responsible developer",
     },
     "license": {
@@ -64,6 +64,7 @@
     predict_endpoints=[
         get_predictions,
         get_similarities,
+        get_drug_target_predictions,
     ],
     info=openapi_info,
     itrb_url_prefix=itrb_url_prefix,
@@ -80,6 +81,7 @@
 \n\nService supported by the [NCATS Translator project](https://ncats.nih.gov/translator/about)""",
     trapi_description="""The default example TRAPI query will give you a list of predicted potential drug treatments for a given disease
+
 You can also try this query to retrieve similar entities for a given drug:
 
 ```json
 {
@@ -106,9 +108,31 @@
     },
     "query_options": { "n_results": 5 }
 }
+```
+
+Or this TRAPI query to get drug-target predictions:
+
+```json
+{
+  "message": {
+    "query_graph": {
+      "edges": {"e01": {"object": "n1", "predicates": ["biolink:interacts_with"], "subject": "n0"}},
+      "nodes": {
+        "n0": {
+          "categories": ["biolink:Drug"],
+          "ids": ["PUBCHEM.COMPOUND:5329102", "PUBCHEM.COMPOUND:4039", "CHEMBL.COMPOUND:CHEMBL1431"]},
+        "n1": {
+          "categories": ["biolink:Protein"],
+          "ids": ["UniProtKB:O75251"]
+        }
+      }
+    }
+  },
+  "query_options": {"max_score": 1, "min_score": 0.1, "n_results": 10}
+}
 ```
 """,
 )
 
 app.include_router(openpredict_api)
-app.include_router(drkg_model_api)
+# app.include_router(drkg_model_api)
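The new drug-target example from the description above can be replayed against a running instance — a sketch, assuming the server listens locally on port 8808 and exposes the standard TRAPI `/query` operation:

```python
# Sketch (assumptions: server on localhost:8808, standard TRAPI /query route).
import requests

query = {
    "message": {
        "query_graph": {
            "edges": {"e01": {"object": "n1", "predicates": ["biolink:interacts_with"], "subject": "n0"}},
            "nodes": {
                "n0": {
                    "categories": ["biolink:Drug"],
                    "ids": ["PUBCHEM.COMPOUND:5329102", "PUBCHEM.COMPOUND:4039", "CHEMBL.COMPOUND:CHEMBL1431"],
                },
                "n1": {"categories": ["biolink:Protein"], "ids": ["UniProtKB:O75251"]},
            },
        }
    },
    "query_options": {"max_score": 1, "min_score": 0.1, "n_results": 10},
}
resp = requests.post("http://localhost:8808/query", json=query, timeout=300)
resp.raise_for_status()
print(len(resp.json()["message"]["results"]), "drug-target predictions")
```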