diff --git a/app.py b/app.py index fb2ebc69..32a69f2f 100644 --- a/app.py +++ b/app.py @@ -29,7 +29,6 @@ from fast_api.routes.heuristic import router as heuristic_router from fast_api.routes.data_browser import router as data_browser_router from fast_api.routes.labeling import router as labeling_router -from fast_api.routes.record_ide import router as record_ide_router from fast_api.routes.record import router as record_router from fast_api.routes.weak_supervision import router as weak_supervision_router from fast_api.routes.labeling_tasks import router as labeling_tasks_router @@ -54,7 +53,6 @@ PREFIX_HEURISTIC, PREFIX_DATA_BROWSER, PREFIX_LABELING, - PREFIX_RECORD_IDE, PREFIX_RECORD, PREFIX_WEAK_SUPERVISION, PREFIX_LABELING_TASKS, @@ -100,9 +98,6 @@ data_browser_router, prefix=PREFIX_DATA_BROWSER, tags=["data-browser"] ) fastapi_app.include_router(labeling_router, prefix=PREFIX_LABELING, tags=["labeling"]) -fastapi_app.include_router( - record_ide_router, prefix=PREFIX_RECORD_IDE, tags=["record-ide"] -), fastapi_app.include_router(record_router, prefix=PREFIX_RECORD, tags=["record"]), fastapi_app.include_router( weak_supervision_router, prefix=PREFIX_WEAK_SUPERVISION, tags=["weak-supervision"] diff --git a/controller/project/manager.py b/controller/project/manager.py index 7afb7f69..ff440877 100644 --- a/controller/project/manager.py +++ b/controller/project/manager.py @@ -8,7 +8,6 @@ from controller.labeling_access_link import manager as link_manager from submodules.model import Project, enums from submodules.model.business_objects import ( - labeling_task, organization, project, record, @@ -42,10 +41,6 @@ def get_project(project_id: str) -> Project: return project.get(project_id) -def get_project_with_labeling_tasks(project_id: str) -> Project: - return project.get_with_labling_tasks(project_id) - - def get_project_with_labeling_tasks_info_attributes(project_id: str) -> Project: return project.get_with_labling_tasks_info_attributes(project_id) @@ -109,10 +104,6 @@ def get_max_running_id(project_id: str) -> int: return project.get_max_running_id(project_id) -def is_rats_tokenization_still_running(project_id: str) -> bool: - return project.is_rats_tokenization_still_running(project_id) - - def create_project( organization_id: str, name: str, description: str, user_id: str ) -> Project: @@ -202,31 +193,6 @@ def get_label_distribution( return project.get_label_distribution(project_id, labeling_task_id, slice_id) -def get_confidence_distribution( - project_id: str, - labeling_task_id: str, - slice_id: Optional[str] = None, - num_samples: Optional[int] = None, -) -> str: - return project.get_confidence_distribution( - project_id, labeling_task_id, slice_id, num_samples - ) - - -def get_confusion_matrix( - project_id: str, - labeling_task_id: str, - slice_id: Optional[str] = None, -) -> str: - for_classification = ( - labeling_task.get(project_id, labeling_task_id).task_type - == enums.LabelingTaskType.CLASSIFICATION.value - ) - return project.get_confusion_matrix( - project_id, labeling_task_id, for_classification, slice_id - ) - - def resolve_request_huddle_data( project_id: str, user_id: str, data_id: str, huddle_type: str ) -> HuddleData: diff --git a/controller/record_ide/__init__.py b/controller/record_ide/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/controller/record_ide/manager.py b/controller/record_ide/manager.py deleted file mode 100644 index faa69791..00000000 --- a/controller/record_ide/manager.py +++ /dev/null @@ -1,7 +0,0 @@ -from typing import 
List - -from util.record_ide import run_record_ide - - -def create_record_ide_payload(user_id: str, project_id: str, record_id: str, code: str) -> List[str]: - return run_record_ide(user_id, project_id, record_id, code) diff --git a/fast_api/models.py b/fast_api/models.py index bbd54529..5d1f5387 100644 --- a/fast_api/models.py +++ b/fast_api/models.py @@ -107,10 +107,6 @@ class UploadCredentialsAndIdBody(BaseModel): key: Optional[StrictStr] = None -class RecordIdeBody(BaseModel): - code: StrictStr - - class NotificationsBody(BaseModel): project_filter: List[StrictStr] level_filter: List[StrictStr] diff --git a/fast_api/routes/project.py b/fast_api/routes/project.py index 511537f9..ad1a5ccd 100644 --- a/fast_api/routes/project.py +++ b/fast_api/routes/project.py @@ -11,15 +11,13 @@ ) from fastapi import APIRouter, Body, Depends, Request from fast_api.routes.client_response import pack_json_result -from typing import Dict, List +from typing import Dict from controller.auth import manager as auth_manager from controller.attribute import manager as attr_manager -from controller.labeling_task import manager as task_manager from controller.upload_task import manager as upload_task_manager from submodules.model.business_objects import information_source, labeling_task -from submodules.model import enums, events +from submodules.model import enums from submodules.model.business_objects.embedding import get_all_embeddings_by_project_id -from submodules.model.enums import LabelingTaskType from submodules.model.business_objects.project import get_project_by_project_id_sql from submodules.model.business_objects.labeling_task import ( get_labeling_tasks_by_project_id_full, @@ -33,10 +31,6 @@ to_frontend_obj_raw, ) from util import notification -from util.inter_annotator.functions import ( - resolve_inter_annotator_matrix_classification, - resolve_inter_annotator_matrix_extraction, -) from controller.misc import manager as misc from exceptions.exceptions import NotAllowedInOpenSourceError from submodules.model.business_objects import notification as notification_model @@ -137,87 +131,6 @@ def general_project_stats( ) -@router.get( - "/{project_id}/inter-annotator-matrix", - dependencies=[Depends(auth_manager.check_project_access_dep)], -) -def inter_annotator_matrix( - project_id: str, - labeling_task_id: str, - include_gold_star: Optional[bool] = True, - include_all_org_user: Optional[bool] = False, - only_on_static_slice: Optional[str] = None, -) -> Dict: - - labeling_task = task_manager.get_labeling_task(project_id, labeling_task_id) - if not labeling_task: - raise ValueError("Can't match labeling task to given Ids") - fp = None - if labeling_task.task_type == LabelingTaskType.CLASSIFICATION.value: - fp = resolve_inter_annotator_matrix_classification - elif labeling_task.task_type == LabelingTaskType.INFORMATION_EXTRACTION.value: - fp = resolve_inter_annotator_matrix_extraction - else: - raise ValueError(f"Can't match task type {labeling_task.task_type}") - - return pack_json_result( - { - "data": { - "interAnnotatorMatrix": fp( - labeling_task, - include_gold_star, - include_all_org_user, - only_on_static_slice, - ) - } - }, - wrap_for_frontend=False, # not wrapped as the prepared results in snake_case are still the expected form the frontend - ) - - -@router.get( - "/{project_id}/confusion-matrix", - dependencies=[Depends(auth_manager.check_project_access_dep)], -) -def confusion_matrix( - project_id: str, - labeling_task_id: str, - slice_id: Optional[str] = None, -) -> Dict: - return 
pack_json_result( - { - "data": { - "confusionMatrix": manager.get_confusion_matrix( - project_id, labeling_task_id, slice_id - ) - } - }, - wrap_for_frontend=False, # not wrapped as the prepared results in snake_case are still the expected form the frontend - ) - - -@router.get( - "/{project_id}/confidence-distribution", - dependencies=[Depends(auth_manager.check_project_access_dep)], -) -def confidence_distribution( - project_id: str, - labeling_task_id: Optional[str] = None, - slice_id: Optional[str] = None, - num_samples: int = 100, -) -> List: - return pack_json_result( - { - "data": { - "confidenceDistribution": manager.get_confidence_distribution( - project_id, labeling_task_id, slice_id, num_samples - ) - } - }, - wrap_for_frontend=False, # not wrapped as the prepared results in snake_case are still the expected form the frontend - ) - - @router.get( "/{project_id}/label-distribution", dependencies=[Depends(auth_manager.check_project_access_dep)], @@ -386,19 +299,6 @@ def get_model_provider_info(request: Request) -> Dict: return pack_json_result({"data": {"modelProviderInfo": data}}) -@router.get( - "/{project_id}/rats-running", - dependencies=[Depends(auth_manager.check_project_access_dep)], -) -def is_rats_running( - request: Request, - project_id: str, -) -> Dict: - - data = manager.is_rats_tokenization_still_running(project_id) - return pack_json_result({"data": {"isRatsTokenizationStillRunning": data}}) - - @router.get( "/{project_id}/last-export-credentials", dependencies=[Depends(auth_manager.check_project_access_dep)], diff --git a/fast_api/routes/record_ide.py b/fast_api/routes/record_ide.py deleted file mode 100644 index 4f653432..00000000 --- a/fast_api/routes/record_ide.py +++ /dev/null @@ -1,30 +0,0 @@ -from fastapi import APIRouter, Depends, Request, Body -from controller.record_ide import manager -from controller.auth import manager as auth_manager -from fast_api.models import RecordIdeBody -from fast_api.routes.client_response import pack_json_result - -router = APIRouter() - - -@router.post( - "/{project_id}/{record_id}/record-ide", - dependencies=[Depends(auth_manager.check_project_access_dep)], -) -def get_record_ide( - request: Request, - project_id: str, - record_id: str, - record_ide_body: RecordIdeBody = Body(...), -): - - user_id = auth_manager.get_user_by_info(request.state.info).id - return pack_json_result( - { - "data": { - "runRecordIde": manager.create_record_ide_payload( - user_id, project_id, record_id, record_ide_body.code - ) - } - } - ) diff --git a/route_prefix.py b/route_prefix.py index f14cc739..b519c320 100644 --- a/route_prefix.py +++ b/route_prefix.py @@ -12,7 +12,6 @@ PREFIX_HEURISTIC = PREFIX + "/heuristic" PREFIX_DATA_BROWSER = PREFIX + "/data-browser" PREFIX_LABELING = PREFIX + "/labeling" -PREFIX_RECORD_IDE = PREFIX + "/record-ide" PREFIX_RECORD = PREFIX + "/record" PREFIX_WEAK_SUPERVISION = PREFIX + "/weak-supervision" PREFIX_LABELING_TASKS = PREFIX + "/labeling-tasks" diff --git a/start b/start index 486b7016..6bda6bf9 100755 --- a/start +++ b/start @@ -54,7 +54,6 @@ docker run -d --rm \ -e AC_EXEC_ENV_IMAGE=registry.dev.kern.ai/code-kern-ai/refinery-ac-exec-env:dev$IS_ARM64 \ -e LF_EXEC_ENV_IMAGE=registry.dev.kern.ai/code-kern-ai/refinery-lf-exec-env:dev$IS_ARM64 \ -e ML_EXEC_ENV_IMAGE=registry.dev.kern.ai/code-kern-ai/refinery-ml-exec-env:dev$IS_ARM64 \ --e RECORD_IDE_IMAGE=registry.dev.kern.ai/code-kern-ai/refinery-record-ide-env:dev$IS_ARM64 \ -e LF_NETWORK=dev-setup_default \ -e S3_ENDPOINT="http://$HOST_IP:7053" \ -e 
S3_ENDPOINT_LOCAL=object-storage:9000 \ diff --git a/submodules/model b/submodules/model index afe67cc3..20190575 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit afe67cc3b6ecaa79fa5582d44ea137cfa61545a4 +Subproject commit 201905750781aefb9dcca53a5389144e73edd775 diff --git a/util/inter_annotator/functions.py b/util/inter_annotator/functions.py deleted file mode 100644 index 90e5abda..00000000 --- a/util/inter_annotator/functions.py +++ /dev/null @@ -1,238 +0,0 @@ -from typing import Dict, Any -from controller.auth import kratos -from submodules.model import models -from submodules.model.business_objects import data_slice -from submodules.model import enums -from submodules.model.business_objects.inter_annotator import ( - check_inter_annotator_classification_records_only_used_once, - get_current_inter_annotator_classification_users, - get_all_inter_annotator_classification_users, - get_classification_user_by_user_label_count, - get_extraction_user_max_lookup, - get_inter_annotator_extraction_users, - get_extraction_user_by_user_label_count, -) - - -def resolve_inter_annotator_matrix_classification( - labeling_task: models.LabelingTask, - include_gold_star: bool, - include_all_org_user: bool, - static_slice_id: str, -): - project_id = str(labeling_task.project_id) - labeling_task_id = str(labeling_task.id) - - __run_checks_inter_annotator_classification( - project_id, labeling_task_id, static_slice_id - ) - - all_users = [] - if include_all_org_user: - users = __get_all_inter_annotator_classification_users( - project_id, labeling_task_id, static_slice_id - ) - else: - users = __get_current_inter_annotator_classification_users( - project_id, labeling_task_id, static_slice_id - ) - if ( - not include_gold_star - and enums.InterAnnotatorConstants.ID_GOLD_USER.value in users - ): - del users[enums.InterAnnotatorConstants.ID_GOLD_USER.value] - elif ( - include_gold_star - and enums.InterAnnotatorConstants.ID_GOLD_USER.value not in users - ): - users[enums.InterAnnotatorConstants.ID_GOLD_USER.value] = 0 - - all_users = kratos.resolve_all_user_ids(list(users.keys())) - all_users = [{"user": user, "count": users[user["id"]]} for user in all_users] - all_users.sort( - key=lambda x: ( - x["user"]["id"] - if x["user"]["id"] != enums.InterAnnotatorConstants.ID_GOLD_USER.value - else "zzz" - ) - ) - - elements = [] - count_lookup = __get_classification_user_by_user_label_count( - project_id, labeling_task_id, static_slice_id - ) - - for userA in users: - for userB in users: - if userA == userB: - percent = 1 - else: - percent = count_lookup.get(userA + "@" + userB) - if percent is None: - percent = -1 - elements.append( - {"userIdA": userA, "userIdB": userB, "percent": float(percent)} - ) - return { - "allUsers": all_users, - "countNames": len(all_users), - "elements": elements, - } - - -def __run_checks_inter_annotator_classification( - project_id: str, labeling_task_id: str, slice_id: str -) -> None: - __check_classification_records_only_used_once( - project_id, labeling_task_id, slice_id - ) - if slice_id: - __check_slice_id_valid(project_id, slice_id) - - -def __check_slice_id_valid(project_id: str, slice_id: str) -> None: - data_slice_item = data_slice.get(project_id, slice_id, True) - if not data_slice_item: - raise ValueError(f"Can't find static data slice with id {slice_id}") - - -def __check_classification_records_only_used_once( - project_id: str, labeling_task_id: str, slice_id: str -) -> None: - result = 
check_inter_annotator_classification_records_only_used_once( - project_id, labeling_task_id, slice_id - ) - if result.count > 0 and result.sum != result.count: - raise ValueError( - f"Project: {project_id}, task {labeling_task_id} has a missmatch in user / classification amount" - ) - - -def __get_current_inter_annotator_classification_users( - project_id: str, labeling_task_id: str, slice_id: str -) -> Dict[str, int]: - result = get_current_inter_annotator_classification_users( - project_id, labeling_task_id, slice_id - ) - return {x.user_id: x.distinct_records for x in result} - - -def __get_all_inter_annotator_classification_users( - project_id: str, labeling_task_id: str, slice_id: str -) -> Dict[str, int]: - result = get_all_inter_annotator_classification_users( - project_id, labeling_task_id, slice_id - ) - - return {x.user_id: x.distinct_records for x in result} - - -def __get_classification_user_by_user_label_count( - project_id: str, labeling_task_id: str, slice_id: str -) -> Dict[str, float]: - result = get_classification_user_by_user_label_count( - project_id, labeling_task_id, slice_id - ) - - return {x.user_lookup: x.percent for x in result} - - -def resolve_inter_annotator_matrix_extraction( - labeling_task: models.LabelingTask, - include_gold_star: bool, - include_all_org_user: bool, - static_slice_id: str, -): - project_id = str(labeling_task.project_id) - labeling_task_id = str(labeling_task.id) - - __run_checks_inter_annotator_extraction(project_id, static_slice_id) - - all_users = [] - users = __get_inter_annotator_extraction_users( - project_id, labeling_task_id, static_slice_id, include_all_org_user - ) - if ( - not include_gold_star - and enums.InterAnnotatorConstants.ID_GOLD_USER.value in users - ): - del users[enums.InterAnnotatorConstants.ID_GOLD_USER.value] - elif ( - include_gold_star - and enums.InterAnnotatorConstants.ID_GOLD_USER.value not in users - ): - users[enums.InterAnnotatorConstants.ID_GOLD_USER.value] = 0 - - all_users = kratos.resolve_all_user_ids(list(users.keys())) - all_users = [{"user": user, "count": users[user["id"]]} for user in all_users] - all_users.sort( - key=lambda x: ( - x["user"]["id"] - if x["user"]["id"] != enums.InterAnnotatorConstants.ID_GOLD_USER.value - else "zzz" - ) - ) - - elements = [] - max_lookup = __get_extraction_user_max_lookup( - project_id, labeling_task_id, static_slice_id - ) - count_lookup = __get_extraction_user_by_user_label_count( - project_id, labeling_task_id, static_slice_id - ) - - for userA in users: - for userB in users: - if userA == userB: - percent = 1 - else: - user_lookup = userA + "@" + userB - amount = count_lookup.get(user_lookup) - if amount is None: - amount = 0 - full_count = max_lookup.get(user_lookup) - if full_count is None or full_count == 0: - percent = -1 - else: - percent = round(amount / full_count, 4) - elements.append( - {"user_id_a": userA, "user_id_b": userB, "percent": percent} - ) - return { - "allUsers": all_users, - "countNames": len(all_users), - "elements": elements, - } - - -def __run_checks_inter_annotator_extraction(project_id: str, slice_id: str) -> None: - if slice_id: - __check_slice_id_valid(project_id, slice_id) - - -def __get_inter_annotator_extraction_users( - project_id: str, labeling_task_id: str, slice_id: str, all_user: bool -) -> Dict[str, int]: - result = get_inter_annotator_extraction_users( - project_id, labeling_task_id, slice_id, all_user - ) - - return {x.user_id: x.distinct_records for x in result} - - -def __get_extraction_user_max_lookup( - project_id: 
str, labeling_task_id: str, slice_id: str -) -> Dict[str, Any]: - result = get_extraction_user_max_lookup(project_id, labeling_task_id, slice_id) - - return {x.user_lookup: x.possible_matches for x in result} - - -def __get_extraction_user_by_user_label_count( - project_id: str, labeling_task_id: str, slice_id: str -) -> Dict[str, int]: - result = get_extraction_user_by_user_label_count( - project_id, labeling_task_id, slice_id - ) - - return {x.user_lookup: x.count_same for x in result} diff --git a/util/record_ide.py b/util/record_ide.py deleted file mode 100644 index 2e3905b1..00000000 --- a/util/record_ide.py +++ /dev/null @@ -1,145 +0,0 @@ -import os -from typing import Any, List - -from controller.knowledge_base import util as knowledge_base -import docker -from controller.tokenization import manager as tokenization_manager -import pickle -import tarfile -from submodules.model.business_objects import record -from submodules.model.business_objects.record import get_tokenized_record_from_db -import time -import uuid - -from submodules.model import daemon - -client = docker.from_env() -image = os.getenv("RECORD_IDE_IMAGE") -exec_env_network = os.getenv("LF_NETWORK") - -__containers_running = {} - - -def copy_to(src: str, dst: str, tar_path: str) -> None: - # https://stackoverflow.com/questions/46390309/how-to-copy-a-file-from-host-to-container-using-docker-py-docker-sdk - name, dst = dst.split(":") - container = client.containers.get(name) - - os.chdir(os.path.dirname(src)) - srcname = os.path.basename(src) - with tarfile.open(tar_path, "w") as tar: - try: - tar.add(srcname) - finally: - tar.close() - - with open(tar_path, "rb") as file: - container.put_archive(path="/", data=file) - - -def run_record_ide( - user_id: str, project_id: str, record_id: str, code: str -) -> List[str]: - record_bytes_path = pack_record_data(project_id, record_id) - knowledge_base_bytes_path = pack_knowledge_base(project_id) - - command = [code, record_bytes_path, knowledge_base_bytes_path] - cpu_limit = docker.types.Ulimit(name="cpu", soft=50, hard=50) - container_name = str(uuid.uuid4()) - container = client.containers.create( - command=command, - name=container_name, - image=image, - detach=True, - network=exec_env_network, - ulimits=[cpu_limit], - ) - error = "" - try: - record_tar_path = f"{record_id}.tar" - knowledge_base_tar_path = f"{project_id}.tar" - - copy_to( - f"./{record_bytes_path}", - f"{container.name}:/{record_bytes_path}", - record_tar_path, - ) - copy_to( - f"./{knowledge_base_bytes_path}", - f"{container.name}:/{knowledge_base_bytes_path}", - knowledge_base_tar_path, - ) - daemon.run_without_db_token(cancel_container, container_name, container) - __containers_running[container_name] = True - container.start() - logs_arr = [ - line.decode("utf-8").strip("\n") - for line in container.logs( - stream=True, stdout=True, stderr=True, timestamps=False - ) - ] - logs = "\n".join(logs_arr) - if logs_arr: - last_log = logs_arr[-1] - if "Killed" in last_log and "/usr/local/bin/python run_ide.py" in last_log: - error = "cpu time" - - finally: - if not __containers_running[container_name]: - error = "run time" - else: - container.stop() - container.remove() - os.remove(record_bytes_path) - os.remove(record_tar_path) - os.remove(knowledge_base_bytes_path) - os.remove(knowledge_base_tar_path) - - if error: - logs += f"\n\nUnfortunatly the {error} was exceeded.\n\nIf this is not by mistake an infinite loop situation please contact our support." 
- del __containers_running[container_name] - return logs - - -def cancel_container(name: str, container: Any): - TIMEOUT = 60 - time.sleep(TIMEOUT) - if name in __containers_running and __containers_running[name]: - __containers_running[name] = False - container.stop() - print(f"Cancelled coontainer {name} after {TIMEOUT} sec", flush=True) - - -def container_exists(containers: Any, name: str) -> bool: - try: - return containers.get(name) is not None - except docker.errors.NotFound: - pass - return False - - -def pack_record_data(project_id: str, record_id: str) -> str: - tokenized_record = get_tokenized_record_from_db(project_id, record_id) - if not tokenized_record: - return None - used_columns = {value for value in tokenized_record.columns} - - full_data = tokenization_manager.__get_docs_from_db(project_id, record_id) - - record_data = record.get(project_id, record_id).data - for c in record_data: - if c not in used_columns: - full_data[c] = record_data[c] - - record_bytes_path = f"{record_id}record_bytes.p" - with open(record_bytes_path, "wb") as file: - pickle.dump(full_data, file) - return record_bytes_path - - -def pack_knowledge_base(project_id: str) -> str: - knowledge_base_source = knowledge_base.build_knowledge_base_from_project(project_id) - knowledge_base_path = f"{project_id}knowledge_base.p" - with open(knowledge_base_path, "wb") as file: - pickle.dump(knowledge_base_source, file) - return knowledge_base_path
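
Note for downstream users: the largest removal here is the record-IDE sandbox (util/record_ide.py), which executed user-supplied code inside a short-lived Docker container, capped CPU time with a ulimit, copied the record payload in as a tar archive, and streamed the container logs back. Should the pattern be needed elsewhere, here is a minimal docker-py sketch under stated assumptions: SANDBOX_IMAGE and run_user_code are illustrative placeholders rather than anything left in this codebase, and the tar copy-in and the wall-clock timeout daemon from the deleted module are omitted for brevity.

    import uuid

    import docker

    client = docker.from_env()
    # Placeholder image; the deleted code read RECORD_IDE_IMAGE from the environment.
    SANDBOX_IMAGE = "python:3.11-slim"


    def run_user_code(code: str) -> str:
        # Cap CPU seconds so user code cannot spin forever (the same Ulimit
        # trick the removed run_record_ide used).
        cpu_limit = docker.types.Ulimit(name="cpu", soft=50, hard=50)
        container = client.containers.create(
            image=SANDBOX_IMAGE,
            command=["python", "-c", code],
            name=str(uuid.uuid4()),
            detach=True,
            ulimits=[cpu_limit],
        )
        try:
            container.start()
            # Stream stdout/stderr until the container exits, then join the chunks.
            return b"".join(
                container.logs(stream=True, stdout=True, stderr=True)
            ).decode("utf-8")
        finally:
            # force=True also removes a container that is still running.
            container.remove(force=True)

As in the deleted run_record_ide, the CPU ulimit only bounds processor time; the original additionally enforced a 60-second wall-clock timeout from outside the container (daemon.run_without_db_token plus cancel_container), which any reuse of this pattern would need to reinstate.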
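
The other sizable removal, util/inter_annotator/functions.py, derived its matrix from two lookups keyed as "userA@userB": how many annotations the pair labeled identically, and how many they could have matched on. A self-contained sketch of that agreement computation, assuming plain dicts in place of the deleted business-object queries (it mirrors the extraction path; the classification variant received the percentages precomputed):

    from typing import Dict, List


    def agreement_elements(
        users: List[str],
        count_same: Dict[str, int],  # "userA@userB" -> annotations labeled identically
        possible: Dict[str, int],    # "userA@userB" -> annotations both users overlapped on
    ) -> List[Dict]:
        elements = []
        for user_a in users:
            for user_b in users:
                if user_a == user_b:
                    percent = 1.0  # a user trivially agrees with themselves
                else:
                    key = f"{user_a}@{user_b}"
                    full = possible.get(key, 0)
                    # -1 is the sentinel the removed code used for pairs
                    # with no overlapping annotations.
                    percent = round(count_same.get(key, 0) / full, 4) if full else -1
                elements.append(
                    {"userIdA": user_a, "userIdB": user_b, "percent": percent}
                )
        return elements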