marieai · Rithsek99 · Feb 15, 2024 · Feb 16, 2024 · Feb 20, 2024 · Feb 22, 2024
diff --git a/config/service/marie-extract.yml b/config/service/marie-extract.yml
@@ -0,0 +1,215 @@
+jtype: Flow  # declares this document as a Flow definition
+version: "1"  # quoted so the value stays a string rather than an integer
+protocol: grpc  # NOTE(review): `with.protocol` further down also lists http and grpc — confirm which one the loader honors
+
+# Shared configuration: anchored mappings that later sections pull in with `<<:` merge keys
+shared_config:
+  storage: &storage
+    psql: &psql_conf_shared
+      provider: postgresql
+      hostname: 127.0.0.1
+      port: 5432
+      username: postgres  # NOTE(review): DB credentials are committed in plain text, unlike the MQ entries that read ENV.*
+      password: '123456'  # quoted: a bare 123456 is loaded as an integer, not a string
+      database: postgres
+      default_table: shared_docs
+
+  message: &message
+    amazon_mq: &amazon_mq_conf_shared
+      provider: amazon-rabbitmq
+      hostname: ${{ ENV.AWS_MQ_HOSTNAME }}
+      port: 15672  # NOTE(review): 15672 is RabbitMQ's default management port — confirm it is the intended port with tls enabled
+      username: ${{ ENV.AWS_MQ_USERNAME }}
+      password: ${{ ENV.AWS_MQ_PASSWORD }}
+      tls: true
+      virtualhost: /
+
+    # RabbitMQ connection; host, port and credentials are read from ENV.*
+    rabbitmq: &rabbitmq_conf_shared
+      provider: rabbitmq
+      hostname: ${{ ENV.RABBIT_MQ_HOSTNAME }}
+      port: ${{ ENV.RABBIT_MQ_PORT }}
+      username: ${{ ENV.RABBIT_MQ_USERNAME }}
+      password: ${{ ENV.RABBIT_MQ_PASSWORD }}
+      tls: false
+      virtualhost: /
+
+
+# Toast event tracking system
+# Events can be backed by a local file, a message queue and/or a database.
+toast:
+  native:
+    enabled: true
+    path: /tmp/marie/events.json
+  rabbitmq:
+    <<: *rabbitmq_conf_shared
+    enabled: true
+  psql:
+    <<: *psql_conf_shared
+    default_table: event_tracking  # overrides the default_table merged in from the shared psql mapping
+    enabled: true
+
+# Document Storage
+# The storage service is used to store the data that is being processed.
+# Storage can be backed by an S3-compatible object store or by PostgreSQL.
+
+storage:
+  # S3 configuration. Will be used only if value of backend is "s3"
+  s3:
+    enabled: true
+    metadata_only: false  # If true, only metadata will be stored in the storage backend
+    # API endpoint to connect to: AWS S3 or any S3-compatible object storage endpoint.
+    endpoint_url: ${{ ENV.S3_ENDPOINT_URL }}
+    # Optional.
+    # Access key id when using static credentials.
+    access_key_id: ${{ ENV.S3_ACCESS_KEY_ID }}
+    # Optional.
+    # Secret key when using static credentials.
+    secret_access_key: ${{ ENV.S3_SECRET_ACCESS_KEY }}
+    # Name of the S3 bucket.
+    bucket_name: ${{ ENV.S3_BUCKET_NAME }}
+    # Optional.
+    # Example: "region: us-east-2"
+    region: ${{ ENV.S3_REGION }}
+    # Optional.
+    # Enable if the endpoint is plain http.
+    insecure: true
+    # Optional.
+    # Enable if you want to use path-style requests.
+    addressing_style: path
+
+  # PostgreSQL configuration. Will be used only if value of backend is "psql"
+  psql:
+    <<: *psql_conf_shared
+    default_table: store_metadata
+    enabled: false
+
+# Job queue scheduler, backed by the shared PostgreSQL connection
+scheduler:
+  psql:
+    <<: *psql_conf_shared
+    default_table: job_queue  # replaces the shared default_table for this section
+    enabled: true
+
+# FLOW / GATEWAY configuration
+
+with:
+  # Gateway listeners. NOTE(review): ports and protocols appear to pair up by position
+  # (51000 with http, 52000 with grpc) — confirm against the gateway loader.
+  port: [51000, 52000]
+  protocol: [http, grpc]
+
+  # service discovery
+  discovery: true
+  discovery_host: 127.0.0.1
+  discovery_port: 8500
+
+  host: 127.0.0.1
+
+  # monitoring
+  monitoring: true
+  port_monitoring: 57844
+
+  event_tracking: true
+
+  expose_endpoints:
+    /document/extract:
+      methods: [POST]
+      summary: Extract data-POC
+      tags:
+        - extract
+    /status:
+      methods: [POST]
+      summary: Status
+      tags:
+        - extract
+
+    /text/status:
+      methods: [POST]
+      summary: Extract data
+      tags:
+        - extract
+
+    /ner/extract:
+      methods: [POST]
+      summary: Extract NER
+      tags:
+        - ner
+
+    /document/classify:
+      methods: [POST]
+      summary: Classify document at page level
+      tags:
+        - classify
+
+prefetch: 4  # NOTE(review): declared at the top level rather than under `with:` — confirm the loader reads it here
+
+executors:
+  - name: extract_t
+    uses:
+      jtype: TextExtractionExecutor
+      # jtype: TextExtractionExecutorMock
+      with:
+        storage:
+          # PostgreSQL storage for this executor; default_table overrides the shared value
+          psql:
+            <<: *psql_conf_shared
+            default_table: extract_metadata
+            enabled: true
+        pipelines:
+          - pipeline:
+              name: 'default'
+              default: true
+              page_classifier:
+                - model_name_or_path: 'marie/lmv3-medical-document-classification'
+                  name: 'medical_page_classifier'
+                  group: 'medical-page-classifier'
+                  type: 'transformers'
+                  task: 'text-classification-multimodal'
+                  device: 'cuda'
+                  batch_size: 1
+                  enabled: true
+
+                - model_name_or_path: 'marie/lmv3-medical-document-payer'
+                  name: 'medical_payer_classifier'
+                  group: 'medical-payer-classifier'
+                  type: 'transformers'
+                  task: 'text-classification-multimodal'
+                  device: 'cuda'
+                  batch_size: 1
+                  enabled: true
+
+              page_indexer:
+                - model_name_or_path: 'rms/layoutlmv3-large-corr-ner'
+                  name: 'page_indexer_patient'
+                  group: 'medical-payer-classifier'  # NOTE(review): indexer is named *_patient but shares the payer classifier's group — confirm
+                  type: 'transformers'
+                  device: 'cuda'
+                  enabled: true
+                  filter:
+                    type: 'regex'
+                    pattern: '.*'
+              page_splitter:
+                model_name_or_path: 'marie/layoutlmv3-medical-document-splitter'
+                enabled: true
+      metas:
+        py_modules:
+          - marie.executor.text
+    timeout_ready: 3000000  # NOTE(review): presumably milliseconds (about 50 minutes) — confirm the unit
+    replicas: 1
+    #    replicas: ${{ CONTEXT.gpu_device_count }}
+    env:
+      CUDA_VISIBLE_DEVICES: RR
+# Authentication and Authorization configuration
+# NOTE(review): API keys are committed in plain text; consider reading them from ENV.* like the S3 and MQ secrets in this file.
+auth:
+  keys:
+    - name: service-A
+      api_key: mas_0aPJ9Q9nUO1Ac1vJTfffXEXs9FyGLf9BzfYgZ_RaHm707wmbfHJNPQ
+      enabled: true
+      roles: [admin, user]
+
+    - name: service-B
+      api_key: mau_t6qDi1BcL1NkLI8I6iM8z1va0nZP01UQ6LWecpbDz6mbxWgIIIZPfQ
+      enabled: true
+      roles: [admin, user]
diff --git a/config/tests-integration/pipeline-classify-004.partial.yml b/config/tests-integration/pipeline-classify-004.partial.yml
@@ -0,0 +1,129 @@
+pipelines:
+  - pipeline:
+      name: 'default'  # name of the pipeline, used for logging and asset saving
+      default: true
+      id2label:
+        '0': additional_information
+        '1': attorney
+        '2': auth_approval
+        '3': auth_denial
+        '4': bankruptcy
+        '5': cms_letter
+        '6': dispute
+        '7': eligibility
+        '8': medical_certificate
+        '9': medical_record
+        '10': misc
+        '11': newborn
+        '12': noop_blank
+        '13': noop_check
+        '14': noop_cover
+        '15': noop_envelope
+        '16': noop_eob
+        '17': noop_hicfa
+        '18': noop_notice
+        '19': noop_patpay
+        '20': noop_w9
+        '21': pa_162
+        '22': referral
+        '23': refund_request
+        '24': tax_1099
+
+      page_classifier:
+        - model_name_or_path: 'rms/corr-layoutlmv3-classifier'
+          name: 'corr_page_classifier_layoutlmv3'
+          group: 'corr-classifier'
+          type: 'transformers'
+          task: 'text-classification-multimodal'
+          device: 'cuda'
+          enabled: true
+
+        - model_name_or_path: 'rms/corr-longformer-classifier'
+          name: 'corr_page_classifier_longformer'
+          group: 'corr-classifier'
+          type: 'transformers'
+          task: 'text-classification'
+          device: 'cuda'
+          batch_size: 1  # batch size > 1 causes errors due to wrong batch aggregation
+          enabled: true
+        - model_name_or_path: 'rms/corr-layoutlmv3-classifier'
+          name: 'corr_page_classifier_layoutlmv3'  # NOTE(review): same name and model as the first entry, different group — confirm this is intended
+          group: 'jpmc-classifier'
+          type: 'transformers'
+          task: 'text-classification-multimodal'
+          device: 'cuda'
+          enabled: true
+#        - model_name_or_path: 'rms/corr-payer-longformer-classifier'
+#          task: 'text-classification'
+#          name: 'corr_payer_longformer'
+#          type: 'transformers'
+#          enabled: True
+#          batch_size: 1 # batch size > 1 causes errors due to wrong batch aggregation
+#          device: 'cuda'
+#          group: 'corr-payer-classifier'
+
+      sub_classifier:
+        - model_name_or_path: 'rms/corr-auth-longformer-classifier'
+          name: 'corr_auth_sub_classifier'
+          group: 'corr-classifier'
+          type: 'transformers'
+          task: 'text-classification'
+          device: 'cuda'
+          batch_size: 1  # batch size > 1 causes errors due to wrong batch aggregation
+          enabled: true
+
+          id2label:
+            '0': auth_denial_in
+            '1': auth_denial_op
+          # Filter should be on the same level as the sub-classifier, for now this is just a global filter
+          filter:
+            type: 'exact'
+            pattern: 'auth_denial'
+      page_indexer:
+        - model_name_or_path: 'rms/layoutlmv3-large-corr-ner'
+          name: 'page_indexer_patient'
+          type: 'transformers'
+          device: 'cuda'
+          enabled: true
+          filter:
+            type: 'regex'
+            pattern: '.*'
+  - pipeline:
+      name: 'jpmc-corr'
+      default: false
+      device: 'cuda'
+      id2label:
+        '0': additional_information
+        '1': attorney
+        '2': auth_approval
+        '3': auth_denial
+        '4': bankruptcy
+        '5': cms_letter
+        '6': dispute
+        '7': eligibility
+        '8': medical_certificate
+        '9': medical_record
+        '10': misc
+        '11': newborn
+        '12': noop_blank
+        '13': noop_check
+        '14': noop_cover
+        '15': noop_envelope
+        '16': noop_eob
+        '17': noop_hicfa
+        '18': noop_notice
+        '19': noop_patpay
+        '20': noop_w9
+        '21': pa_162
+        '22': referral
+        '23': refund_request
+        '24': tax_1099
+
+      page_classifier:
+        - model_name_or_path: 'rms/corr-layoutlmv3-classifier'
+          name: 'corr_page_classifier_layoutlmv3'
+          group: 'corr-classifier'
+          type: 'transformers'
+          task: 'text-classification-multimodal'
+          device: 'cuda'
+          enabled: true
diff --git a/config/tests-integration/pipeline-integration-001.partial.yml b/config/tests-integration/pipeline-integration-001.partial.yml
@@ -0,0 +1,46 @@
+pipelines:
+  - pipeline:
+      name: 'default'
+      default: true
+      page_classifier:
+        - model_name_or_path: 'marie/lmv3-medical-document-classification'
+          name: 'medical_page_classifier'
+          group: 'medical-classifier'
+          type: 'transformers'
+          task: 'text-classification-multimodal'
+          device: 'cuda'
+          batch_size: 1
+          enabled: true
+
+        - model_name_or_path: 'marie/lmv3-medical-document-payer'
+          name: 'medical_payer_classifier'
+          group: 'medical-payer-classifier'
+          type: 'transformers'
+          task: 'text-classification-multimodal'
+          device: 'cuda'
+          batch_size: 1
+          enabled: true
+
+      page_indexer:
+        - model_name_or_path: 'rms/layoutlmv3-large-corr-ner'
+          name: 'page_indexer_patient'
+          group: 'medical-classifier'
+          type: 'transformers'
+          device: 'cuda'
+          enabled: true
+          filter:
+            type: 'regex'
+            pattern: '.*'
+        - model_name_or_path: 'rms/layoutlmv3-large-corr-ner'
+          name: 'page_indexer_payer'
+          group: 'medical-payer-classifier'
+          type: 'transformers'
+          device: 'cuda'
+          enabled: true
+          filter:
+            type: 'regex'
+            pattern: '.*'
+
+      page_splitter:
+        model_name_or_path: 'marie/layoutlmv3-medical-document-splitter'
+        enabled: true